summaryrefslogtreecommitdiff
path: root/core/string
diff options
context:
space:
mode:
Diffstat (limited to 'core/string')
-rw-r--r--core/string/node_path.cpp15
-rw-r--r--core/string/node_path.h2
-rw-r--r--core/string/print_string.cpp95
-rw-r--r--core/string/print_string.h12
-rw-r--r--core/string/ustring.cpp393
-rw-r--r--core/string/ustring.h7
6 files changed, 371 insertions, 153 deletions
diff --git a/core/string/node_path.cpp b/core/string/node_path.cpp
index 238897c2b1..30fa434fad 100644
--- a/core/string/node_path.cpp
+++ b/core/string/node_path.cpp
@@ -199,6 +199,21 @@ Vector<StringName> NodePath::get_subnames() const {
return Vector<StringName>();
}
+StringName NodePath::get_concatenated_names() const {
+ ERR_FAIL_COND_V(!data, StringName());
+
+ if (!data->concatenated_path) {
+ int pc = data->path.size();
+ String concatenated;
+ const StringName *sn = data->path.ptr();
+ for (int i = 0; i < pc; i++) {
+ concatenated += i == 0 ? sn[i].operator String() : "/" + sn[i];
+ }
+ data->concatenated_path = concatenated;
+ }
+ return data->concatenated_path;
+}
+
StringName NodePath::get_concatenated_subnames() const {
ERR_FAIL_COND_V(!data, StringName());
diff --git a/core/string/node_path.h b/core/string/node_path.h
index 53976bd524..2bce33e21e 100644
--- a/core/string/node_path.h
+++ b/core/string/node_path.h
@@ -39,6 +39,7 @@ class NodePath {
SafeRefCount refcount;
Vector<StringName> path;
Vector<StringName> subpath;
+ StringName concatenated_path;
StringName concatenated_subpath;
bool absolute;
bool has_slashes;
@@ -59,6 +60,7 @@ public:
StringName get_subname(int p_idx) const;
Vector<StringName> get_names() const;
Vector<StringName> get_subnames() const;
+ StringName get_concatenated_names() const;
StringName get_concatenated_subnames() const;
NodePath rel_path_to(const NodePath &p_np) const;
diff --git a/core/string/print_string.cpp b/core/string/print_string.cpp
index 919c9e08e3..f58486e0a5 100644
--- a/core/string/print_string.cpp
+++ b/core/string/print_string.cpp
@@ -79,7 +79,98 @@ void __print_line(String p_string) {
_global_lock();
PrintHandlerList *l = print_handler_list;
while (l) {
- l->printfunc(l->userdata, p_string, false);
+ l->printfunc(l->userdata, p_string, false, false);
+ l = l->next;
+ }
+
+ _global_unlock();
+}
+
+void __print_line_rich(String p_string) {
+ if (!_print_line_enabled) {
+ return;
+ }
+
+ // Convert a subset of BBCode tags to ANSI escape codes for correct display in the terminal.
+ // Support of those ANSI escape codes varies across terminal emulators,
+ // especially for italic and strikethrough.
+ String p_string_ansi = p_string;
+
+ p_string_ansi = p_string_ansi.replace("[b]", "\u001b[1m");
+ p_string_ansi = p_string_ansi.replace("[/b]", "\u001b[22m");
+ p_string_ansi = p_string_ansi.replace("[i]", "\u001b[3m");
+ p_string_ansi = p_string_ansi.replace("[/i]", "\u001b[23m");
+ p_string_ansi = p_string_ansi.replace("[u]", "\u001b[4m");
+ p_string_ansi = p_string_ansi.replace("[/u]", "\u001b[24m");
+ p_string_ansi = p_string_ansi.replace("[s]", "\u001b[9m");
+ p_string_ansi = p_string_ansi.replace("[/s]", "\u001b[29m");
+
+ p_string_ansi = p_string_ansi.replace("[indent]", " ");
+ p_string_ansi = p_string_ansi.replace("[/indent]", "");
+ p_string_ansi = p_string_ansi.replace("[code]", "\u001b[2m");
+ p_string_ansi = p_string_ansi.replace("[/code]", "\u001b[22m");
+ p_string_ansi = p_string_ansi.replace("[url]", "");
+ p_string_ansi = p_string_ansi.replace("[/url]", "");
+ p_string_ansi = p_string_ansi.replace("[center]", "\n\t\t\t");
+ p_string_ansi = p_string_ansi.replace("[/center]", "");
+ p_string_ansi = p_string_ansi.replace("[right]", "\n\t\t\t\t\t\t");
+ p_string_ansi = p_string_ansi.replace("[/right]", "");
+
+ if (p_string_ansi.contains("[color")) {
+ p_string_ansi = p_string_ansi.replace("[color=black]", "\u001b[30m");
+ p_string_ansi = p_string_ansi.replace("[color=red]", "\u001b[91m");
+ p_string_ansi = p_string_ansi.replace("[color=green]", "\u001b[92m");
+ p_string_ansi = p_string_ansi.replace("[color=lime]", "\u001b[92m");
+ p_string_ansi = p_string_ansi.replace("[color=yellow]", "\u001b[93m");
+ p_string_ansi = p_string_ansi.replace("[color=blue]", "\u001b[94m");
+ p_string_ansi = p_string_ansi.replace("[color=magenta]", "\u001b[95m");
+ p_string_ansi = p_string_ansi.replace("[color=pink]", "\u001b[38;5;218m");
+ p_string_ansi = p_string_ansi.replace("[color=purple]", "\u001b[38;5;98m");
+ p_string_ansi = p_string_ansi.replace("[color=cyan]", "\u001b[96m");
+ p_string_ansi = p_string_ansi.replace("[color=white]", "\u001b[97m");
+ p_string_ansi = p_string_ansi.replace("[color=orange]", "\u001b[38;5;208m");
+ p_string_ansi = p_string_ansi.replace("[color=gray]", "\u001b[90m");
+ p_string_ansi = p_string_ansi.replace("[/color]", "\u001b[39m");
+ }
+ if (p_string_ansi.contains("[bgcolor")) {
+ p_string_ansi = p_string_ansi.replace("[bgcolor=black]", "\u001b[40m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=red]", "\u001b[101m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=green]", "\u001b[102m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=lime]", "\u001b[102m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=yellow]", "\u001b[103m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=blue]", "\u001b[104m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=magenta]", "\u001b[105m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=pink]", "\u001b[48;5;218m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=purple]", "\u001b[48;5;98m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=cyan]", "\u001b[106m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=white]", "\u001b[107m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=orange]", "\u001b[48;5;208m");
+ p_string_ansi = p_string_ansi.replace("[bgcolor=gray]", "\u001b[100m");
+ p_string_ansi = p_string_ansi.replace("[/bgcolor]", "\u001b[49m");
+ }
+ if (p_string_ansi.contains("[fgcolor")) {
+ p_string_ansi = p_string_ansi.replace("[fgcolor=black]", "\u001b[30;40m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=red]", "\u001b[91;101m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=green]", "\u001b[92;102m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=lime]", "\u001b[92;102m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=yellow]", "\u001b[93;103m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=blue]", "\u001b[94;104m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=magenta]", "\u001b[95;105m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=pink]", "\u001b[38;5;218;48;5;218m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=purple]", "\u001b[38;5;98;48;5;98m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=cyan]", "\u001b[96;106m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=white]", "\u001b[97;107m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=orange]", "\u001b[38;5;208;48;5;208m");
+ p_string_ansi = p_string_ansi.replace("[fgcolor=gray]", "\u001b[90;100m");
+ p_string_ansi = p_string_ansi.replace("[/fgcolor]", "\u001b[39;49m");
+ }
+
+ OS::get_singleton()->print_rich("%s\n", p_string_ansi.utf8().get_data());
+
+ _global_lock();
+ PrintHandlerList *l = print_handler_list;
+ while (l) {
+ l->printfunc(l->userdata, p_string, false, true);
l = l->next;
}
@@ -96,7 +187,7 @@ void print_error(String p_string) {
_global_lock();
PrintHandlerList *l = print_handler_list;
while (l) {
- l->printfunc(l->userdata, p_string, true);
+ l->printfunc(l->userdata, p_string, true, false);
l = l->next;
}
diff --git a/core/string/print_string.h b/core/string/print_string.h
index f7d0f25030..823e2c29e8 100644
--- a/core/string/print_string.h
+++ b/core/string/print_string.h
@@ -35,7 +35,7 @@
extern void (*_print_func)(String);
-typedef void (*PrintHandlerFunc)(void *, const String &p_string, bool p_error);
+typedef void (*PrintHandlerFunc)(void *, const String &p_string, bool p_error, bool p_rich);
struct PrintHandlerList {
PrintHandlerFunc printfunc = nullptr;
@@ -59,6 +59,7 @@ void remove_print_handler(const PrintHandlerList *p_handler);
extern bool _print_line_enabled;
extern bool _print_error_enabled;
extern void __print_line(String p_string);
+extern void __print_line_rich(String p_string);
extern void print_error(String p_string);
extern void print_verbose(String p_string);
@@ -66,9 +67,18 @@ inline void print_line(Variant v) {
__print_line(stringify_variants(v));
}
+inline void print_line_rich(Variant v) {
+ __print_line_rich(stringify_variants(v));
+}
+
template <typename... Args>
void print_line(Variant p_var, Args... p_args) {
__print_line(stringify_variants(p_var, p_args...));
}
+template <typename... Args>
+void print_line_rich(Variant p_var, Args... p_args) {
+ __print_line_rich(stringify_variants(p_var, p_args...));
+}
+
#endif // PRINT_STRING_H
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index df1aae5370..beefe54faf 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -323,7 +323,13 @@ void String::copy_from(const char *p_cstr) {
char32_t *dst = this->ptrw();
for (size_t i = 0; i <= len; i++) {
- dst[i] = p_cstr[i];
+ uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
+ if (c == 0 && i < len) {
+ print_unicode_error("NUL character", true);
+ dst[i] = 0x20;
+ } else {
+ dst[i] = c;
+ }
}
}
@@ -350,7 +356,13 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) {
char32_t *dst = this->ptrw();
for (int i = 0; i < len; i++) {
- dst[i] = p_cstr[i];
+ uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
+ if (c == 0) {
+ print_unicode_error("NUL character", true);
+ dst[i] = 0x20;
+ } else {
+ dst[i] = c;
+ }
}
dst[len] = 0;
}
@@ -376,14 +388,21 @@ void String::copy_from(const wchar_t *p_cstr, const int p_clip_to) {
}
void String::copy_from(const char32_t &p_char) {
+ if (p_char == 0) {
+ print_unicode_error("NUL character", true);
+ return;
+ }
+ if ((p_char & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
+ }
+ if (p_char > 0x10ffff) {
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+ }
+
resize(2);
+
char32_t *dst = ptrw();
- if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + ".");
- dst[0] = 0xfffd;
- } else {
- dst[0] = p_char;
- }
+ dst[0] = p_char;
dst[1] = 0;
}
@@ -437,12 +456,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) {
dst[p_length] = 0;
for (int i = 0; i < p_length; i++) {
- if ((p_char[i] >= 0xd800 && p_char[i] <= 0xdfff) || (p_char[i] > 0x10ffff)) {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char[i], 16) + ".");
- dst[i] = 0xfffd;
- } else {
- dst[i] = p_char[i];
+ if (p_char[i] == 0) {
+ print_unicode_error("NUL character", true);
+ dst[i] = 0x20;
+ continue;
+ }
+ if ((p_char[i] & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
}
+ if (p_char[i] > 0x10ffff) {
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
+ }
+ dst[i] = p_char[i];
}
}
@@ -481,7 +506,7 @@ String operator+(const wchar_t *p_chr, const String &p_str) {
// wchar_t is 16-bit
String tmp = String::utf16((const char16_t *)p_chr);
#else
- // wchar_t is 32-bi
+ // wchar_t is 32-bit
String tmp = (const char32_t *)p_chr;
#endif
tmp += p_str;
@@ -527,7 +552,13 @@ String &String::operator+=(const char *p_str) {
char32_t *dst = ptrw() + lhs_len;
for (size_t i = 0; i <= rhs_len; i++) {
- dst[i] = p_str[i];
+ uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
+ if (c == 0 && i < rhs_len) {
+ print_unicode_error("NUL character", true);
+ dst[i] = 0x20;
+ } else {
+ dst[i] = c;
+ }
}
return *this;
@@ -550,15 +581,21 @@ String &String::operator+=(const char32_t *p_str) {
}
String &String::operator+=(char32_t p_char) {
+ if (p_char == 0) {
+ print_unicode_error("NUL character", true);
+ return *this;
+ }
+ if ((p_char & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
+ }
+ if (p_char > 0x10ffff) {
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+ }
+
const int lhs_len = length();
resize(lhs_len + 2);
char32_t *dst = ptrw();
- if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + ".");
- dst[lhs_len] = 0xfffd;
- } else {
- dst[lhs_len] = p_char;
- }
+ dst[lhs_len] = p_char;
dst[lhs_len + 1] = 0;
return *this;
@@ -1583,6 +1620,14 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) {
return ret;
}
+void String::print_unicode_error(const String &p_message, bool p_critical) const {
+ if (p_critical) {
+ print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message));
+ } else {
+ print_error(vformat("Unicode parsing error: %s", p_message));
+ }
+}
+
CharString String::ascii(bool p_allow_extended) const {
if (!length()) {
return CharString();
@@ -1596,7 +1641,7 @@ CharString String::ascii(bool p_allow_extended) const {
if ((c <= 0x7f) || (c <= 0xff && p_allow_extended)) {
cs[i] = c;
} else {
- print_error("Unicode parsing error: Cannot represent " + num_int64(c, 16) + " as ASCII/Latin-1 character.");
+ print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
cs[i] = 0x20;
}
}
@@ -1611,11 +1656,9 @@ String String::utf8(const char *p_utf8, int p_len) {
return ret;
}
-bool String::parse_utf8(const char *p_utf8, int p_len) {
-#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-8?");
-
+Error String::parse_utf8(const char *p_utf8, int p_len) {
if (!p_utf8) {
- return true;
+ return ERR_INVALID_DATA;
}
String aux;
@@ -1635,14 +1678,17 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
}
}
+ bool decode_error = false;
+ bool decode_failed = false;
{
const char *ptrtmp = p_utf8;
const char *ptrtmp_limit = &p_utf8[p_len];
int skip = 0;
+ uint8_t c_start = 0;
while (ptrtmp != ptrtmp_limit && *ptrtmp) {
- if (skip == 0) {
- uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
+ uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
+ if (skip == 0) {
/* Determine the number of characters in sequence */
if ((c & 0x80) == 0) {
skip = 0;
@@ -1652,20 +1698,34 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
skip = 2;
} else if ((c & 0xf8) == 0xf0) {
skip = 3;
+ } else if ((c & 0xfc) == 0xf8) {
+ skip = 4;
+ } else if ((c & 0xfe) == 0xfc) {
+ skip = 5;
} else {
- UNICERROR("invalid skip at " + num_int64(cstr_size));
- return true; //invalid utf8
+ skip = 0;
+ print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
+ decode_failed = true;
}
+ c_start = c;
if (skip == 1 && (c & 0x1e) == 0) {
- UNICERROR("overlong rejected at " + num_int64(cstr_size));
- return true; //reject overlong
+ print_unicode_error(vformat("Overlong encoding (%x ...)", c));
+ decode_error = true;
}
-
str_size++;
-
} else {
- --skip;
+ if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
+ print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
+ decode_error = true;
+ }
+ if (c < 0x80 || c > 0xbf) {
+ print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
+ decode_failed = true;
+ skip = 0;
+ } else {
+ --skip;
+ }
}
cstr_size++;
@@ -1673,80 +1733,91 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
}
if (skip) {
- UNICERROR("no space left");
- return true; //not enough space
+ print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
+ decode_failed = true;
}
}
if (str_size == 0) {
clear();
- return false;
+ return OK; // empty string
}
resize(str_size + 1);
char32_t *dst = ptrw();
dst[str_size] = 0;
+ int skip = 0;
+ uint32_t unichar = 0;
while (cstr_size) {
- int len = 0;
-
- /* Determine the number of characters in sequence */
- if ((*p_utf8 & 0x80) == 0) {
- len = 1;
- } else if ((*p_utf8 & 0xe0) == 0xc0) {
- len = 2;
- } else if ((*p_utf8 & 0xf0) == 0xe0) {
- len = 3;
- } else if ((*p_utf8 & 0xf8) == 0xf0) {
- len = 4;
- } else {
- UNICERROR("invalid len");
- return true; //invalid UTF8
- }
-
- if (len > cstr_size) {
- UNICERROR("no space left");
- return true; //not enough space
- }
-
- if (len == 2 && (*p_utf8 & 0x1E) == 0) {
- UNICERROR("no space left");
- return true; //reject overlong
- }
-
- /* Convert the first character */
-
- uint32_t unichar = 0;
-
- if (len == 1) {
- unichar = *p_utf8;
+ uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
+
+ if (skip == 0) {
+ /* Determine the number of characters in sequence */
+ if ((c & 0x80) == 0) {
+ *(dst++) = c;
+ unichar = 0;
+ skip = 0;
+ } else if ((c & 0xe0) == 0xc0) {
+ unichar = (0xff >> 3) & c;
+ skip = 1;
+ } else if ((c & 0xf0) == 0xe0) {
+ unichar = (0xff >> 4) & c;
+ skip = 2;
+ } else if ((c & 0xf8) == 0xf0) {
+ unichar = (0xff >> 5) & c;
+ skip = 3;
+ } else if ((c & 0xfc) == 0xf8) {
+ unichar = (0xff >> 6) & c;
+ skip = 4;
+ } else if ((c & 0xfe) == 0xfc) {
+ unichar = (0xff >> 7) & c;
+ skip = 5;
+ } else {
+ *(dst++) = 0x20;
+ unichar = 0;
+ skip = 0;
+ }
} else {
- unichar = (0xff >> (len + 1)) & *p_utf8;
-
- for (int i = 1; i < len; i++) {
- if ((p_utf8[i] & 0xc0) != 0x80) {
- UNICERROR("invalid utf8");
- return true; //invalid utf8
- }
- if (unichar == 0 && i == 2 && ((p_utf8[i] & 0x7f) >> (7 - len)) == 0) {
- UNICERROR("invalid utf8 overlong");
- return true; //no overlong
+ if (c < 0x80 || c > 0xbf) {
+ *(dst++) = 0x20;
+ skip = 0;
+ } else {
+ unichar = (unichar << 6) | (c & 0x3f);
+ --skip;
+ if (skip == 0) {
+ if (unichar == 0) {
+ print_unicode_error("NUL character", true);
+ decode_failed = true;
+ unichar = 0x20;
+ }
+ if ((unichar & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", unichar));
+ decode_error = true;
+ }
+ if (unichar > 0x10ffff) {
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar));
+ decode_error = true;
+ }
+ *(dst++) = unichar;
}
- unichar = (unichar << 6) | (p_utf8[i] & 0x3f);
}
}
- if (unichar >= 0xd800 && unichar <= 0xdfff) {
- UNICERROR("invalid code point");
- return CharString();
- }
- *(dst++) = unichar;
- cstr_size -= len;
- p_utf8 += len;
+ cstr_size--;
+ p_utf8++;
+ }
+ if (skip) {
+ *(dst++) = 0x20;
}
- return false;
-#undef UNICERROR
+ if (decode_failed) {
+ return ERR_INVALID_DATA;
+ } else if (decode_error) {
+ return ERR_PARSE_ERROR;
+ } else {
+ return OK;
+ }
}
CharString String::utf8() const {
@@ -1765,15 +1836,17 @@ CharString String::utf8() const {
fl += 2;
} else if (c <= 0xffff) { // 16 bits
fl += 3;
- } else if (c <= 0x0010ffff) { // 21 bits
+ } else if (c <= 0x001fffff) { // 21 bits
fl += 4;
+ } else if (c <= 0x03ffffff) { // 26 bits
+ fl += 5;
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", c));
+ } else if (c <= 0x7fffffff) { // 31 bits
+ fl += 6;
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", c));
} else {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
- return CharString();
- }
- if (c >= 0xd800 && c <= 0xdfff) {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
- return CharString();
+ fl += 1;
+ print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-8", c), true);
}
}
@@ -1799,11 +1872,26 @@ CharString String::utf8() const {
APPEND_CHAR(uint32_t(0xe0 | ((c >> 12) & 0x0f))); // Top 4 bits.
APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Middle 6 bits.
APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
- } else { // 21 bits
+ } else if (c <= 0x001fffff) { // 21 bits
APPEND_CHAR(uint32_t(0xf0 | ((c >> 18) & 0x07))); // Top 3 bits.
APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper middle 6 bits.
APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits.
APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+ } else if (c <= 0x03ffffff) { // 26 bits
+ APPEND_CHAR(uint32_t(0xf8 | ((c >> 24) & 0x03))); // Top 2 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Upper middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+ } else if (c <= 0x7fffffff) { // 31 bits
+ APPEND_CHAR(uint32_t(0xfc | ((c >> 30) & 0x01))); // Top 1 bit.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 24) & 0x3f))); // Upper upper middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Lower upper middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper lower middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+ } else {
+ APPEND_CHAR(0x20);
}
}
#undef APPEND_CHAR
@@ -1819,11 +1907,9 @@ String String::utf16(const char16_t *p_utf16, int p_len) {
return ret;
}
-bool String::parse_utf16(const char16_t *p_utf16, int p_len) {
-#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-16?");
-
+Error String::parse_utf16(const char16_t *p_utf16, int p_len) {
if (!p_utf16) {
- return true;
+ return ERR_INVALID_DATA;
}
String aux;
@@ -1850,80 +1936,90 @@ bool String::parse_utf16(const char16_t *p_utf16, int p_len) {
}
}
+ bool decode_error = false;
{
const char16_t *ptrtmp = p_utf16;
const char16_t *ptrtmp_limit = &p_utf16[p_len];
- int skip = 0;
+ uint32_t c_prev = 0;
+ bool skip = false;
while (ptrtmp != ptrtmp_limit && *ptrtmp) {
uint32_t c = (byteswap) ? BSWAP16(*ptrtmp) : *ptrtmp;
- if (skip == 0) {
- if ((c & 0xfffffc00) == 0xd800) {
- skip = 1; // lead surrogate
- } else if ((c & 0xfffffc00) == 0xdc00) {
- UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size));
- return true; // invalid UTF16
- } else {
- skip = 0;
+
+ if ((c & 0xfffffc00) == 0xd800) { // lead surrogate
+ if (skip) {
+ print_unicode_error(vformat("Unpaired lead surrogate (%x [trail?] %x)", c_prev, c));
+ decode_error = true;
}
- str_size++;
- } else {
- if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
- --skip;
+ skip = true;
+ } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
+ if (skip) {
+ str_size--;
} else {
- UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size));
- return true; // invalid UTF16
+ print_unicode_error(vformat("Unpaired trail surrogate (%x [lead?] %x)", c_prev, c));
+ decode_error = true;
}
+ skip = false;
+ } else {
+ skip = false;
}
+ c_prev = c;
+ str_size++;
cstr_size++;
ptrtmp++;
}
if (skip) {
- UNICERROR("no space left");
- return true; // not enough space
+ print_unicode_error(vformat("Unpaired lead surrogate (%x [eol])", c_prev));
+ decode_error = true;
}
}
if (str_size == 0) {
clear();
- return false;
+ return OK; // empty string
}
resize(str_size + 1);
char32_t *dst = ptrw();
dst[str_size] = 0;
+ bool skip = false;
+ uint32_t c_prev = 0;
while (cstr_size) {
- int len = 0;
uint32_t c = (byteswap) ? BSWAP16(*p_utf16) : *p_utf16;
- if ((c & 0xfffffc00) == 0xd800) {
- len = 2;
+ if ((c & 0xfffffc00) == 0xd800) { // lead surrogate
+ if (skip) {
+ *(dst++) = c_prev; // unpaired, store as is
+ }
+ skip = true;
+ } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
+ if (skip) {
+ *(dst++) = (c_prev << 10UL) + c - ((0xd800 << 10UL) + 0xdc00 - 0x10000); // decode pair
+ } else {
+ *(dst++) = c; // unpaired, store as is
+ }
+ skip = false;
} else {
- len = 1;
+ *(dst++) = c;
+ skip = false;
}
- if (len > cstr_size) {
- UNICERROR("no space left");
- return true; //not enough space
- }
-
- uint32_t unichar = 0;
- if (len == 1) {
- unichar = c;
- } else {
- uint32_t c2 = (byteswap) ? BSWAP16(p_utf16[1]) : p_utf16[1];
- unichar = (c << 10UL) + c2 - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
- }
+ cstr_size--;
+ p_utf16++;
+ c_prev = c;
+ }
- *(dst++) = unichar;
- cstr_size -= len;
- p_utf16 += len;
+ if (skip) {
+ *(dst++) = c_prev;
}
- return false;
-#undef UNICERROR
+ if (decode_error) {
+ return ERR_PARSE_ERROR;
+ } else {
+ return OK;
+ }
}
Char16String String::utf16() const {
@@ -1938,15 +2034,14 @@ Char16String String::utf16() const {
uint32_t c = d[i];
if (c <= 0xffff) { // 16 bits.
fl += 1;
+ if ((c & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", c));
+ }
} else if (c <= 0x10ffff) { // 32 bits.
fl += 2;
} else {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
- return Char16String();
- }
- if (c >= 0xd800 && c <= 0xdfff) {
- print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
- return Char16String();
+ print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-16", c), true);
+ fl += 1;
}
}
@@ -1965,9 +2060,11 @@ Char16String String::utf16() const {
if (c <= 0xffff) { // 16 bits.
APPEND_CHAR(c);
- } else { // 32 bits.
+ } else if (c <= 0x10ffff) { // 32 bits.
APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
+ } else {
+ APPEND_CHAR(0x20);
}
}
#undef APPEND_CHAR
diff --git a/core/string/ustring.h b/core/string/ustring.h
index 11d0974381..1b8bf3d234 100644
--- a/core/string/ustring.h
+++ b/core/string/ustring.h
@@ -271,6 +271,9 @@ public:
bool is_valid_string() const;
+ /* debug, error messages */
+ void print_unicode_error(const String &p_message, bool p_critical = false) const;
+
/* complex helpers */
String substr(int p_from, int p_chars = -1) const;
int find(const String &p_str, int p_from = 0) const; ///< return <0 if failed
@@ -373,11 +376,11 @@ public:
CharString ascii(bool p_allow_extended = false) const;
CharString utf8() const;
- bool parse_utf8(const char *p_utf8, int p_len = -1); //return true on error
+ Error parse_utf8(const char *p_utf8, int p_len = -1);
static String utf8(const char *p_utf8, int p_len = -1);
Char16String utf16() const;
- bool parse_utf16(const char16_t *p_utf16, int p_len = -1); //return true on error
+ Error parse_utf16(const char16_t *p_utf16, int p_len = -1);
static String utf16(const char16_t *p_utf16, int p_len = -1);
static uint32_t hash(const char32_t *p_cstr, int p_len); /* hash the string */