summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHenry Conklin <henrywconklin@gmail.com>2021-02-11 09:33:55 -0600
committerHenry Conklin <henrywconklin@gmail.com>2021-02-14 11:00:25 -0600
commitc6a911f0379b429691aa0dca6615523892bbfb06 (patch)
tree40b85fbb5f85cca9d99bfc91123a600a6aa62a4a
parente5bb89cdd5e92fa6fdeff78aad08bf0cbfbcc692 (diff)
Add support for numeric XML entities to XMLParser
* Add support for decimal numeric entities to String::xml_unescape * Add more error checks to String::xml_unescape * Refactor XMLParser to use String::xml_unescape instead of an internal implementation
-rw-r--r--core/io/xml_parser.cpp66
-rw-r--r--core/io/xml_parser.h2
-rw-r--r--core/string/ustring.cpp62
-rw-r--r--tests/test_main.cpp1
-rw-r--r--tests/test_string.h46
-rw-r--r--tests/test_xml_parser.h74
6 files changed, 169 insertions, 82 deletions
diff --git a/core/io/xml_parser.cpp b/core/io/xml_parser.cpp
index 905be6089d..1574634aad 100644
--- a/core/io/xml_parser.cpp
+++ b/core/io/xml_parser.cpp
@@ -36,63 +36,6 @@
VARIANT_ENUM_CAST(XMLParser::NodeType);
-static bool _equalsn(const char32_t *str1, const char32_t *str2, int len) {
- int i;
- for (i = 0; i < len && str1[i] && str2[i]; ++i) {
- if (str1[i] != str2[i]) {
- return false;
- }
- }
-
- // if one (or both) of the strings was smaller then they
- // are only equal if they have the same length
- return (i == len) || (str1[i] == 0 && str2[i] == 0);
-}
-
-String XMLParser::_replace_special_characters(const String &origstr) {
- int pos = origstr.find("&");
- int oldPos = 0;
-
- if (pos == -1) {
- return origstr;
- }
-
- String newstr;
-
- while (pos != -1 && pos < origstr.length() - 2) {
- // check if it is one of the special characters
-
- int specialChar = -1;
- for (int i = 0; i < (int)special_characters.size(); ++i) {
- const char32_t *p = &origstr[pos] + 1;
-
- if (_equalsn(&special_characters[i][1], p, special_characters[i].length() - 1)) {
- specialChar = i;
- break;
- }
- }
-
- if (specialChar != -1) {
- newstr += (origstr.substr(oldPos, pos - oldPos));
- newstr += (special_characters[specialChar][0]);
- pos += special_characters[specialChar].length();
- } else {
- newstr += (origstr.substr(oldPos, pos - oldPos + 1));
- pos += 1;
- }
-
- // find next &
- oldPos = pos;
- pos = origstr.find("&", pos);
- }
-
- if (oldPos < origstr.length() - 1) {
- newstr += (origstr.substr(oldPos, origstr.length() - oldPos));
- }
-
- return newstr;
-}
-
static inline bool _is_white_space(char c) {
return (c == ' ' || c == '\t' || c == '\n' || c == '\r');
}
@@ -116,7 +59,7 @@ bool XMLParser::_set_text(char *start, char *end) {
// set current text to the parsed text, and replace xml special characters
String s = String::utf8(start, (int)(end - start));
- node_name = _replace_special_characters(s);
+ node_name = s.xml_unescape();
// current XML node type is text
node_type = NODE_TEXT;
@@ -292,7 +235,7 @@ void XMLParser::_parse_opening_xml_element() {
String s = String::utf8(attributeValueBegin,
(int)(attributeValueEnd - attributeValueBegin));
- attr.value = _replace_special_characters(s);
+ attr.value = s.xml_unescape();
attributes.push_back(attr);
} else {
// tag is closed directly
@@ -555,11 +498,6 @@ int XMLParser::get_current_line() const {
}
XMLParser::XMLParser() {
- special_characters.push_back("&amp;");
- special_characters.push_back("<lt;");
- special_characters.push_back(">gt;");
- special_characters.push_back("\"quot;");
- special_characters.push_back("'apos;");
}
XMLParser::~XMLParser() {
diff --git a/core/io/xml_parser.h b/core/io/xml_parser.h
index 01af6a90ad..847edf958d 100644
--- a/core/io/xml_parser.h
+++ b/core/io/xml_parser.h
@@ -68,8 +68,6 @@ private:
char *data = nullptr;
char *P = nullptr;
uint64_t length = 0;
- void unescape(String &p_str);
- Vector<String> special_characters;
String node_name;
bool node_empty = false;
NodeType node_type = NODE_NONE;
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 59fda65d43..a57c7b2504 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -3888,25 +3888,55 @@ static _FORCE_INLINE_ int _xml_unescape(const char32_t *p_src, int p_src_len, ch
if (p_src_len >= 4 && p_src[1] == '#') {
char32_t c = 0;
-
- for (int i = 2; i < p_src_len; i++) {
- eat = i + 1;
- char32_t ct = p_src[i];
- if (ct == ';') {
- break;
- } else if (ct >= '0' && ct <= '9') {
- ct = ct - '0';
- } else if (ct >= 'a' && ct <= 'f') {
- ct = (ct - 'a') + 10;
- } else if (ct >= 'A' && ct <= 'F') {
- ct = (ct - 'A') + 10;
- } else {
- continue;
+ bool overflow = false;
+ if (p_src[2] == 'x') {
+ // Hex entity &#x<num>;
+ for (int i = 3; i < p_src_len; i++) {
+ eat = i + 1;
+ char32_t ct = p_src[i];
+ if (ct == ';') {
+ break;
+ } else if (ct >= '0' && ct <= '9') {
+ ct = ct - '0';
+ } else if (ct >= 'a' && ct <= 'f') {
+ ct = (ct - 'a') + 10;
+ } else if (ct >= 'A' && ct <= 'F') {
+ ct = (ct - 'A') + 10;
+ } else {
+ break;
+ }
+ if (c > (UINT32_MAX >> 4)) {
+ overflow = true;
+ break;
+ }
+ c <<= 4;
+ c |= ct;
+ }
+ } else {
+ // Decimal entity &#<num>;
+ for (int i = 2; i < p_src_len; i++) {
+ eat = i + 1;
+ char32_t ct = p_src[i];
+ if (ct == ';' || ct < '0' || ct > '9') {
+ break;
+ }
+ }
+ if (p_src[eat - 1] == ';') {
+ int64_t val = String::to_int(p_src + 2, eat - 3);
+ if (val > 0 && val <= UINT32_MAX) {
+ c = (char32_t)val;
+ } else {
+ overflow = true;
+ }
}
- c <<= 4;
- c |= ct;
}
+ // Value must be non-zero, in the range of char32_t,
+ // actually end with ';'. If invalid, leave the entity as-is
+ if (c == '\0' || overflow || p_src[eat - 1] != ';') {
+ eat = 1;
+ c = *p_src;
+ }
if (p_dst) {
*p_dst = c;
}
diff --git a/tests/test_main.cpp b/tests/test_main.cpp
index 2697eb6399..ad099604c1 100644
--- a/tests/test_main.cpp
+++ b/tests/test_main.cpp
@@ -70,6 +70,7 @@
#include "test_text_server.h"
#include "test_validate_testing.h"
#include "test_variant.h"
+#include "test_xml_parser.h"
#include "modules/modules_tests.gen.h"
diff --git a/tests/test_string.h b/tests/test_string.h
index cc3152203e..17f24fb0d8 100644
--- a/tests/test_string.h
+++ b/tests/test_string.h
@@ -1166,6 +1166,52 @@ TEST_CASE("[String] xml_escape/unescape") {
CHECK(s.xml_escape(false).xml_unescape() == s);
}
+TEST_CASE("[String] xml_unescape") {
+ // Named entities
+ String input = "&quot;&amp;&apos;&lt;&gt;";
+ CHECK(input.xml_unescape() == "\"&\'<>");
+
+ // Numeric entities
+ input = "&#x41;&#66;";
+ CHECK(input.xml_unescape() == "AB");
+
+ input = "&#0;&x#0;More text";
+ String result = input.xml_unescape();
+ // Didn't put in a leading NUL and terminate the string
+ CHECK(input.length() > 0);
+ CHECK(input[0] != '\0');
+ // Entity should be left as-is if invalid
+ CHECK(input.xml_unescape() == input);
+
+ // Check near char32_t range
+ input = "&#xFFFFFFFF;";
+ result = input.xml_unescape();
+ CHECK(result.length() == 1);
+ CHECK(result[0] == 0xFFFFFFFF);
+ input = "&#4294967295;";
+ result = input.xml_unescape();
+ CHECK(result.length() == 1);
+ CHECK(result[0] == 0xFFFFFFFF);
+
+ // Check out of range of char32_t
+ input = "&#xFFFFFFFFF;";
+ CHECK(input.xml_unescape() == input);
+ input = "&#4294967296;";
+ CHECK(input.xml_unescape() == input);
+
+ // Shouldn't consume without ending in a ';'
+ input = "&#66";
+ CHECK(input.xml_unescape() == input);
+ input = "&#x41";
+ CHECK(input.xml_unescape() == input);
+
+ // Invalid characters should make the entity ignored
+ input = "&#x41SomeIrrelevantText;";
+ CHECK(input.xml_unescape() == input);
+ input = "&#66SomeIrrelevantText;";
+ CHECK(input.xml_unescape() == input);
+}
+
TEST_CASE("[String] Strip escapes") {
String s = "\t\tTest Test\r\n Test";
CHECK(s.strip_escapes() == "Test Test Test");
diff --git a/tests/test_xml_parser.h b/tests/test_xml_parser.h
new file mode 100644
index 0000000000..55de048d6a
--- /dev/null
+++ b/tests/test_xml_parser.h
@@ -0,0 +1,74 @@
+/*************************************************************************/
+/* test_xml_parser.h */
+/*************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* https://godotengine.org */
+/*************************************************************************/
+/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
+/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining */
+/* a copy of this software and associated documentation files (the */
+/* "Software"), to deal in the Software without restriction, including */
+/* without limitation the rights to use, copy, modify, merge, publish, */
+/* distribute, sublicense, and/or sell copies of the Software, and to */
+/* permit persons to whom the Software is furnished to do so, subject to */
+/* the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be */
+/* included in all copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+/*************************************************************************/
+
+#ifndef TEST_XML_PARSER_H
+#define TEST_XML_PARSER_H
+
+#include <inttypes.h>
+
+#include "core/io/xml_parser.h"
+#include "core/string/ustring.h"
+
+#include "tests/test_macros.h"
+
+namespace TestXMLParser {
+TEST_CASE("[XMLParser] End-to-end") {
+ String source = "<?xml version = \"1.0\" encoding=\"UTF-8\" ?>\
+<top attr=\"attr value\">\
+ Text&lt;&#65;&#x42;&gt;\
+</top>";
+ Vector<uint8_t> buff = source.to_utf8_buffer();
+
+ XMLParser parser;
+ parser.open_buffer(buff);
+
+ // <?xml ...?> gets parsed as NODE_UNKNOWN
+ CHECK(parser.read() == OK);
+ CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_UNKNOWN);
+
+ CHECK(parser.read() == OK);
+ CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT);
+ CHECK(parser.get_node_name() == "top");
+ CHECK(parser.has_attribute("attr"));
+ CHECK(parser.get_attribute_value("attr") == "attr value");
+
+ CHECK(parser.read() == OK);
+ CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_TEXT);
+ CHECK(parser.get_node_data().lstrip(" \t") == "Text<AB>");
+
+ CHECK(parser.read() == OK);
+ CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT_END);
+ CHECK(parser.get_node_name() == "top");
+
+ parser.close();
+}
+} // namespace TestXMLParser
+
+#endif // TEST_XML_PARSER_H