diff options
Diffstat (limited to 'core/string/ustring.cpp')
-rw-r--r-- | core/string/ustring.cpp | 777 |
1 files changed, 512 insertions, 265 deletions
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp index d189f3224b..4e26b61334 100644 --- a/core/string/ustring.cpp +++ b/core/string/ustring.cpp @@ -161,6 +161,18 @@ bool CharString::operator<(const CharString &p_right) const { return is_str_less(get_data(), p_right.get_data()); } +bool CharString::operator==(const CharString &p_right) const { + if (length() == 0) { + // True if both have length 0, false if only p_right has a length + return p_right.length() == 0; + } else if (p_right.length() == 0) { + // False due to unequal length + return false; + } + + return strcmp(ptr(), p_right.ptr()) == 0; +} + CharString &CharString::operator+=(char p_char) { const int lhs_len = length(); resize(lhs_len + 2); @@ -323,7 +335,13 @@ void String::copy_from(const char *p_cstr) { char32_t *dst = this->ptrw(); for (size_t i = 0; i <= len; i++) { - dst[i] = p_cstr[i]; + uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); + if (c == 0 && i < len) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } } @@ -350,7 +368,13 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) { char32_t *dst = this->ptrw(); for (int i = 0; i < len; i++) { - dst[i] = p_cstr[i]; + uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); + if (c == 0) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } dst[len] = 0; } @@ -376,14 +400,21 @@ void String::copy_from(const wchar_t *p_cstr, const int p_clip_to) { } void String::copy_from(const char32_t &p_char) { + if (p_char == 0) { + print_unicode_error("NUL character", true); + return; + } + if ((p_char & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); + } + if (p_char > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + } + resize(2); + char32_t *dst = ptrw(); - if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + "."); - dst[0] = 0xfffd; - } else { - dst[0] = p_char; - } + dst[0] = p_char; dst[1] = 0; } @@ -437,12 +468,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) { dst[p_length] = 0; for (int i = 0; i < p_length; i++) { - if ((p_char[i] >= 0xd800 && p_char[i] <= 0xdfff) || (p_char[i] > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char[i], 16) + "."); - dst[i] = 0xfffd; - } else { - dst[i] = p_char[i]; + if (p_char[i] == 0) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + continue; + } + if ((p_char[i] & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i])); } + if (p_char[i] > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i])); + } + dst[i] = p_char[i]; } } @@ -481,7 +518,7 @@ String operator+(const wchar_t *p_chr, const String &p_str) { // wchar_t is 16-bit String tmp = String::utf16((const char16_t *)p_chr); #else - // wchar_t is 32-bi + // wchar_t is 32-bit String tmp = (const char32_t *)p_chr; #endif tmp += p_str; @@ -506,10 +543,12 @@ String &String::operator+=(const String &p_str) { resize(lhs_len + rhs_len + 1); - const char32_t *src = p_str.get_data(); + const char32_t *src = p_str.ptr(); char32_t *dst = ptrw() + lhs_len; - memcpy(dst, src, (rhs_len + 1) * sizeof(char32_t)); + // Don't copy the terminating null with `memcpy` to avoid undefined behavior when string is being added to itself (it would overlap the destination). + memcpy(dst, src, rhs_len * sizeof(char32_t)); + *(dst + rhs_len) = _null; return *this; } @@ -527,7 +566,13 @@ String &String::operator+=(const char *p_str) { char32_t *dst = ptrw() + lhs_len; for (size_t i = 0; i <= rhs_len; i++) { - dst[i] = p_str[i]; + uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]); + if (c == 0 && i < rhs_len) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } return *this; @@ -550,15 +595,21 @@ String &String::operator+=(const char32_t *p_str) { } String &String::operator+=(char32_t p_char) { + if (p_char == 0) { + print_unicode_error("NUL character", true); + return *this; + } + if ((p_char & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); + } + if (p_char > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + } + const int lhs_len = length(); resize(lhs_len + 2); char32_t *dst = ptrw(); - if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + "."); - dst[lhs_len] = 0xfffd; - } else { - dst[lhs_len] = p_char; - } + dst[lhs_len] = p_char; dst[lhs_len + 1] = 0; return *this; @@ -931,62 +982,71 @@ const char32_t *String::get_data() const { return size() ? &operator[](0) : &zero; } -String String::capitalize() const { - String aux = this->camelcase_to_underscore(true).replace("_", " ").strip_edges(); - String cap; - for (int i = 0; i < aux.get_slice_count(" "); i++) { - String slice = aux.get_slicec(' ', i); - if (slice.length() > 0) { - slice[0] = _find_upper(slice[0]); - if (i > 0) { - cap += " "; - } - cap += slice; - } - } - - return cap; -} - -String String::camelcase_to_underscore(bool lowercase) const { +String String::_camelcase_to_underscore() const { const char32_t *cstr = get_data(); String new_string; int start_index = 0; for (int i = 1; i < this->size(); i++) { - bool is_upper = is_ascii_upper_case(cstr[i]); - bool is_number = is_digit(cstr[i]); - - bool are_next_2_lower = false; - bool is_next_lower = false; - bool is_next_number = false; - bool was_precedent_upper = is_ascii_upper_case(cstr[i - 1]); - bool was_precedent_number = is_digit(cstr[i - 1]); + bool is_prev_upper = is_ascii_upper_case(cstr[i - 1]); + bool is_prev_lower = is_ascii_lower_case(cstr[i - 1]); + bool is_prev_digit = is_digit(cstr[i - 1]); - if (i + 2 < this->size()) { - are_next_2_lower = is_ascii_lower_case(cstr[i + 1]) && is_ascii_lower_case(cstr[i + 2]); - } + bool is_curr_upper = is_ascii_upper_case(cstr[i]); + bool is_curr_lower = is_ascii_lower_case(cstr[i]); + bool is_curr_digit = is_digit(cstr[i]); + bool is_next_lower = false; if (i + 1 < this->size()) { is_next_lower = is_ascii_lower_case(cstr[i + 1]); - is_next_number = is_digit(cstr[i + 1]); } - const bool cond_a = is_upper && !was_precedent_upper && !was_precedent_number; - const bool cond_b = was_precedent_upper && is_upper && are_next_2_lower; - const bool cond_c = is_number && !was_precedent_number; - const bool can_break_number_letter = is_number && !was_precedent_number && is_next_lower; - const bool can_break_letter_number = !is_number && was_precedent_number && (is_next_lower || is_next_number); + const bool cond_a = is_prev_lower && is_curr_upper; // aA + const bool cond_b = (is_prev_upper || is_prev_digit) && is_curr_upper && is_next_lower; // AAa, 2Aa + const bool cond_c = is_prev_digit && is_curr_lower && is_next_lower; // 2aa + const bool cond_d = (is_prev_upper || is_prev_lower) && is_curr_digit; // A2, a2 - bool should_split = cond_a || cond_b || cond_c || can_break_number_letter || can_break_letter_number; - if (should_split) { + if (cond_a || cond_b || cond_c || cond_d) { new_string += this->substr(start_index, i - start_index) + "_"; start_index = i; } } new_string += this->substr(start_index, this->size() - start_index); - return lowercase ? new_string.to_lower() : new_string; + return new_string.to_lower(); +} + +String String::capitalize() const { + String aux = this->_camelcase_to_underscore().replace("_", " ").strip_edges(); + String cap; + for (int i = 0; i < aux.get_slice_count(" "); i++) { + String slice = aux.get_slicec(' ', i); + if (slice.length() > 0) { + slice[0] = _find_upper(slice[0]); + if (i > 0) { + cap += " "; + } + cap += slice; + } + } + + return cap; +} + +String String::to_camel_case() const { + String s = this->to_pascal_case(); + if (!s.is_empty()) { + s[0] = _find_lower(s[0]); + } + return s; +} + +String String::to_pascal_case() const { + return this->capitalize().replace(" ", ""); +} + +String String::to_snake_case() const { + return this->_camelcase_to_underscore().replace(" ", "_").strip_edges(); } String String::get_with_code_lines() const { @@ -1132,9 +1192,14 @@ Vector<String> String::split(const String &p_splitter, bool p_allow_empty, int p int len = length(); while (true) { - int end = find(p_splitter, from); - if (end < 0) { - end = len; + int end; + if (p_splitter.is_empty()) { + end = from + 1; + } else { + end = find(p_splitter, from); + if (end < 0) { + end = len; + } } if (p_allow_empty || (end > from)) { if (p_maxsplit <= 0) { @@ -1175,7 +1240,15 @@ Vector<String> String::rsplit(const String &p_splitter, bool p_allow_empty, int break; } - int left_edge = rfind(p_splitter, remaining_len - p_splitter.length()); + int left_edge; + if (p_splitter.is_empty()) { + left_edge = remaining_len - 1; + if (left_edge == 0) { + left_edge--; // Skip to the < 0 condition. + } + } else { + left_edge = rfind(p_splitter, remaining_len - p_splitter.length()); + } if (left_edge < 0) { // no more splitters, we're done @@ -1195,8 +1268,8 @@ Vector<String> String::rsplit(const String &p_splitter, bool p_allow_empty, int return ret; } -Vector<float> String::split_floats(const String &p_splitter, bool p_allow_empty) const { - Vector<float> ret; +Vector<double> String::split_floats(const String &p_splitter, bool p_allow_empty) const { + Vector<double> ret; int from = 0; int len = length(); @@ -1399,15 +1472,25 @@ String String::num(double p_num, int p_decimals) { fmt[5] = 'f'; fmt[6] = 0; } - char buf[256]; + // if we want to convert a double with as much decimal places as as + // DBL_MAX or DBL_MIN then we would theoretically need a buffer of at least + // DBL_MAX_10_EXP + 2 for DBL_MAX and DBL_MAX_10_EXP + 4 for DBL_MIN. + // BUT those values where still giving me exceptions, so I tested from + // DBL_MAX_10_EXP + 10 incrementing one by one and DBL_MAX_10_EXP + 17 (325) + // was the first buffer size not to throw an exception + char buf[325]; #if defined(__GNUC__) || defined(_MSC_VER) - snprintf(buf, 256, fmt, p_num); + // PLEASE NOTE that, albeit vcrt online reference states that snprintf + // should safely truncate the output to the given buffer size, we have + // found a case where this is not true, so we should create a buffer + // as big as needed + snprintf(buf, 325, fmt, p_num); #else sprintf(buf, fmt, p_num); #endif - buf[255] = 0; + buf[324] = 0; //destroy trailing zeroes { bool period = false; @@ -1519,10 +1602,11 @@ String String::num_real(double p_num, bool p_trailing) { #else int decimals = 6; #endif - // We want to align the digits to the above sane default, so we only - // need to subtract log10 for numbers with a positive power of ten. - if (p_num > 10) { - decimals -= (int)floor(log10(p_num)); + // We want to align the digits to the above sane default, so we only need + // to subtract log10 for numbers with a positive power of ten magnitude. + double abs_num = Math::abs(p_num); + if (abs_num > 10) { + decimals -= (int)floor(log10(abs_num)); } return num(p_num, decimals); } @@ -1583,6 +1667,14 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) { return ret; } +void String::print_unicode_error(const String &p_message, bool p_critical) const { + if (p_critical) { + print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message)); + } else { + print_error(vformat("Unicode parsing error: %s", p_message)); + } +} + CharString String::ascii(bool p_allow_extended) const { if (!length()) { return CharString(); @@ -1596,7 +1688,7 @@ CharString String::ascii(bool p_allow_extended) const { if ((c <= 0x7f) || (c <= 0xff && p_allow_extended)) { cs[i] = c; } else { - print_error("Unicode parsing error: Cannot represent " + num_int64(c, 16) + " as ASCII/Latin-1 character."); + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c)); cs[i] = 0x20; } } @@ -1611,11 +1703,9 @@ String String::utf8(const char *p_utf8, int p_len) { return ret; } -bool String::parse_utf8(const char *p_utf8, int p_len) { -#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-8?"); - +Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) { if (!p_utf8) { - return true; + return ERR_INVALID_DATA; } String aux; @@ -1635,14 +1725,21 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { } } + bool decode_error = false; + bool decode_failed = false; { const char *ptrtmp = p_utf8; const char *ptrtmp_limit = &p_utf8[p_len]; int skip = 0; + uint8_t c_start = 0; while (ptrtmp != ptrtmp_limit && *ptrtmp) { - if (skip == 0) { - uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp); + uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp); + if (skip == 0) { + if (p_skip_cr && c == '\r') { + ptrtmp++; + continue; + } /* Determine the number of characters in sequence */ if ((c & 0x80) == 0) { skip = 0; @@ -1652,20 +1749,34 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { skip = 2; } else if ((c & 0xf8) == 0xf0) { skip = 3; + } else if ((c & 0xfc) == 0xf8) { + skip = 4; + } else if ((c & 0xfe) == 0xfc) { + skip = 5; } else { - UNICERROR("invalid skip at " + num_int64(cstr_size)); - return true; //invalid utf8 + skip = 0; + print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true); + decode_failed = true; } + c_start = c; if (skip == 1 && (c & 0x1e) == 0) { - UNICERROR("overlong rejected at " + num_int64(cstr_size)); - return true; //reject overlong + print_unicode_error(vformat("Overlong encoding (%x ...)", c)); + decode_error = true; } - str_size++; - } else { - --skip; + if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) { + print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c)); + decode_error = true; + } + if (c < 0x80 || c > 0xbf) { + print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true); + decode_failed = true; + skip = 0; + } else { + --skip; + } } cstr_size++; @@ -1673,80 +1784,95 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { } if (skip) { - UNICERROR("no space left"); - return true; //not enough space + print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true); + decode_failed = true; } } if (str_size == 0) { clear(); - return false; + return OK; // empty string } resize(str_size + 1); char32_t *dst = ptrw(); dst[str_size] = 0; + int skip = 0; + uint32_t unichar = 0; while (cstr_size) { - int len = 0; - - /* Determine the number of characters in sequence */ - if ((*p_utf8 & 0x80) == 0) { - len = 1; - } else if ((*p_utf8 & 0xe0) == 0xc0) { - len = 2; - } else if ((*p_utf8 & 0xf0) == 0xe0) { - len = 3; - } else if ((*p_utf8 & 0xf8) == 0xf0) { - len = 4; - } else { - UNICERROR("invalid len"); - return true; //invalid UTF8 - } - - if (len > cstr_size) { - UNICERROR("no space left"); - return true; //not enough space - } - - if (len == 2 && (*p_utf8 & 0x1E) == 0) { - UNICERROR("no space left"); - return true; //reject overlong - } - - /* Convert the first character */ - - uint32_t unichar = 0; + uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8); - if (len == 1) { - unichar = *p_utf8; + if (skip == 0) { + if (p_skip_cr && c == '\r') { + p_utf8++; + continue; + } + /* Determine the number of characters in sequence */ + if ((c & 0x80) == 0) { + *(dst++) = c; + unichar = 0; + skip = 0; + } else if ((c & 0xe0) == 0xc0) { + unichar = (0xff >> 3) & c; + skip = 1; + } else if ((c & 0xf0) == 0xe0) { + unichar = (0xff >> 4) & c; + skip = 2; + } else if ((c & 0xf8) == 0xf0) { + unichar = (0xff >> 5) & c; + skip = 3; + } else if ((c & 0xfc) == 0xf8) { + unichar = (0xff >> 6) & c; + skip = 4; + } else if ((c & 0xfe) == 0xfc) { + unichar = (0xff >> 7) & c; + skip = 5; + } else { + *(dst++) = 0x20; + unichar = 0; + skip = 0; + } } else { - unichar = (0xff >> (len + 1)) & *p_utf8; - - for (int i = 1; i < len; i++) { - if ((p_utf8[i] & 0xc0) != 0x80) { - UNICERROR("invalid utf8"); - return true; //invalid utf8 - } - if (unichar == 0 && i == 2 && ((p_utf8[i] & 0x7f) >> (7 - len)) == 0) { - UNICERROR("invalid utf8 overlong"); - return true; //no overlong + if (c < 0x80 || c > 0xbf) { + *(dst++) = 0x20; + skip = 0; + } else { + unichar = (unichar << 6) | (c & 0x3f); + --skip; + if (skip == 0) { + if (unichar == 0) { + print_unicode_error("NUL character", true); + decode_failed = true; + unichar = 0x20; + } + if ((unichar & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", unichar)); + decode_error = true; + } + if (unichar > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar)); + decode_error = true; + } + *(dst++) = unichar; } - unichar = (unichar << 6) | (p_utf8[i] & 0x3f); } } - if (unichar >= 0xd800 && unichar <= 0xdfff) { - UNICERROR("invalid code point"); - return CharString(); - } - *(dst++) = unichar; - cstr_size -= len; - p_utf8 += len; + cstr_size--; + p_utf8++; + } + if (skip) { + *(dst++) = 0x20; } - return false; -#undef UNICERROR + if (decode_failed) { + return ERR_INVALID_DATA; + } else if (decode_error) { + return ERR_PARSE_ERROR; + } else { + return OK; + } } CharString String::utf8() const { @@ -1765,15 +1891,17 @@ CharString String::utf8() const { fl += 2; } else if (c <= 0xffff) { // 16 bits fl += 3; - } else if (c <= 0x0010ffff) { // 21 bits + } else if (c <= 0x001fffff) { // 21 bits fl += 4; + } else if (c <= 0x03ffffff) { // 26 bits + fl += 5; + print_unicode_error(vformat("Invalid unicode codepoint (%x)", c)); + } else if (c <= 0x7fffffff) { // 31 bits + fl += 6; + print_unicode_error(vformat("Invalid unicode codepoint (%x)", c)); } else { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return CharString(); - } - if (c >= 0xd800 && c <= 0xdfff) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return CharString(); + fl += 1; + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-8", c), true); } } @@ -1799,11 +1927,26 @@ CharString String::utf8() const { APPEND_CHAR(uint32_t(0xe0 | ((c >> 12) & 0x0f))); // Top 4 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Middle 6 bits. APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. - } else { // 21 bits + } else if (c <= 0x001fffff) { // 21 bits APPEND_CHAR(uint32_t(0xf0 | ((c >> 18) & 0x07))); // Top 3 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper middle 6 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits. APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else if (c <= 0x03ffffff) { // 26 bits + APPEND_CHAR(uint32_t(0xf8 | ((c >> 24) & 0x03))); // Top 2 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else if (c <= 0x7fffffff) { // 31 bits + APPEND_CHAR(uint32_t(0xfc | ((c >> 30) & 0x01))); // Top 1 bit. + APPEND_CHAR(uint32_t(0x80 | ((c >> 24) & 0x3f))); // Upper upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Lower upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else { + APPEND_CHAR(0x20); } } #undef APPEND_CHAR @@ -1819,11 +1962,9 @@ String String::utf16(const char16_t *p_utf16, int p_len) { return ret; } -bool String::parse_utf16(const char16_t *p_utf16, int p_len) { -#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-16?"); - +Error String::parse_utf16(const char16_t *p_utf16, int p_len) { if (!p_utf16) { - return true; + return ERR_INVALID_DATA; } String aux; @@ -1850,80 +1991,90 @@ bool String::parse_utf16(const char16_t *p_utf16, int p_len) { } } + bool decode_error = false; { const char16_t *ptrtmp = p_utf16; const char16_t *ptrtmp_limit = &p_utf16[p_len]; - int skip = 0; + uint32_t c_prev = 0; + bool skip = false; while (ptrtmp != ptrtmp_limit && *ptrtmp) { uint32_t c = (byteswap) ? BSWAP16(*ptrtmp) : *ptrtmp; - if (skip == 0) { - if ((c & 0xfffffc00) == 0xd800) { - skip = 1; // lead surrogate - } else if ((c & 0xfffffc00) == 0xdc00) { - UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size)); - return true; // invalid UTF16 - } else { - skip = 0; + + if ((c & 0xfffffc00) == 0xd800) { // lead surrogate + if (skip) { + print_unicode_error(vformat("Unpaired lead surrogate (%x [trail?] %x)", c_prev, c)); + decode_error = true; } - str_size++; - } else { - if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate - --skip; + skip = true; + } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate + if (skip) { + str_size--; } else { - UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size)); - return true; // invalid UTF16 + print_unicode_error(vformat("Unpaired trail surrogate (%x [lead?] %x)", c_prev, c)); + decode_error = true; } + skip = false; + } else { + skip = false; } + c_prev = c; + str_size++; cstr_size++; ptrtmp++; } if (skip) { - UNICERROR("no space left"); - return true; // not enough space + print_unicode_error(vformat("Unpaired lead surrogate (%x [eol])", c_prev)); + decode_error = true; } } if (str_size == 0) { clear(); - return false; + return OK; // empty string } resize(str_size + 1); char32_t *dst = ptrw(); dst[str_size] = 0; + bool skip = false; + uint32_t c_prev = 0; while (cstr_size) { - int len = 0; uint32_t c = (byteswap) ? BSWAP16(*p_utf16) : *p_utf16; - if ((c & 0xfffffc00) == 0xd800) { - len = 2; + if ((c & 0xfffffc00) == 0xd800) { // lead surrogate + if (skip) { + *(dst++) = c_prev; // unpaired, store as is + } + skip = true; + } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate + if (skip) { + *(dst++) = (c_prev << 10UL) + c - ((0xd800 << 10UL) + 0xdc00 - 0x10000); // decode pair + } else { + *(dst++) = c; // unpaired, store as is + } + skip = false; } else { - len = 1; + *(dst++) = c; + skip = false; } - if (len > cstr_size) { - UNICERROR("no space left"); - return true; //not enough space - } - - uint32_t unichar = 0; - if (len == 1) { - unichar = c; - } else { - uint32_t c2 = (byteswap) ? BSWAP16(p_utf16[1]) : p_utf16[1]; - unichar = (c << 10UL) + c2 - ((0xd800 << 10UL) + 0xdc00 - 0x10000); - } + cstr_size--; + p_utf16++; + c_prev = c; + } - *(dst++) = unichar; - cstr_size -= len; - p_utf16 += len; + if (skip) { + *(dst++) = c_prev; } - return false; -#undef UNICERROR + if (decode_error) { + return ERR_PARSE_ERROR; + } else { + return OK; + } } Char16String String::utf16() const { @@ -1938,15 +2089,14 @@ Char16String String::utf16() const { uint32_t c = d[i]; if (c <= 0xffff) { // 16 bits. fl += 1; + if ((c & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", c)); + } } else if (c <= 0x10ffff) { // 32 bits. fl += 2; } else { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return Char16String(); - } - if (c >= 0xd800 && c <= 0xdfff) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return Char16String(); + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-16", c), true); + fl += 1; } } @@ -1965,9 +2115,11 @@ Char16String String::utf16() const { if (c <= 0xffff) { // 16 bits. APPEND_CHAR(c); - } else { // 32 bits. + } else if (c <= 0x10ffff) { // 32 bits. APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate. APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate. + } else { + APPEND_CHAR(0x20); } } #undef APPEND_CHAR @@ -2508,10 +2660,11 @@ double String::to_float() const { uint32_t String::hash(const char *p_cstr) { uint32_t hashv = 5381; - uint32_t c; + uint32_t c = *p_cstr++; - while ((c = *p_cstr++)) { + while (c) { hashv = ((hashv << 5) + hashv) + c; /* hash * 33 + c */ + c = *p_cstr++; } return hashv; @@ -2537,10 +2690,11 @@ uint32_t String::hash(const wchar_t *p_cstr, int p_len) { uint32_t String::hash(const wchar_t *p_cstr) { uint32_t hashv = 5381; - uint32_t c; + uint32_t c = *p_cstr++; - while ((c = *p_cstr++)) { + while (c) { hashv = ((hashv << 5) + hashv) + c; /* hash * 33 + c */ + c = *p_cstr++; } return hashv; @@ -2557,10 +2711,11 @@ uint32_t String::hash(const char32_t *p_cstr, int p_len) { uint32_t String::hash(const char32_t *p_cstr) { uint32_t hashv = 5381; - uint32_t c; + uint32_t c = *p_cstr++; - while ((c = *p_cstr++)) { + while (c) { hashv = ((hashv << 5) + hashv) + c; /* hash * 33 + c */ + c = *p_cstr++; } return hashv; @@ -2571,10 +2726,11 @@ uint32_t String::hash() const { const char32_t *chr = get_data(); uint32_t hashv = 5381; - uint32_t c; + uint32_t c = *chr++; - while ((c = *chr++)) { + while (c) { hashv = ((hashv << 5) + hashv) + c; /* hash * 33 + c */ + c = *chr++; } return hashv; @@ -2585,10 +2741,11 @@ uint64_t String::hash64() const { const char32_t *chr = get_data(); uint64_t hashv = 5381; - uint64_t c; + uint64_t c = *chr++; - while ((c = *chr++)) { + while (c) { hashv = ((hashv << 5) + hashv) + c; /* hash * 33 + c */ + c = *chr++; } return hashv; @@ -2694,7 +2851,7 @@ String String::substr(int p_from, int p_chars) const { return String(*this); } - String s = String(); + String s; s.copy_from_unchecked(&get_data()[p_from], p_chars); return s; } @@ -3344,51 +3501,52 @@ String String::replacen(const String &p_key, const String &p_with) const { String String::repeat(int p_count) const { ERR_FAIL_COND_V_MSG(p_count < 0, "", "Parameter count should be a positive number."); - String new_string; - const char32_t *src = this->get_data(); - - new_string.resize(length() * p_count + 1); - new_string[length() * p_count] = 0; - - for (int i = 0; i < p_count; i++) { - for (int j = 0; j < length(); j++) { - new_string[i * length() + j] = src[j]; - } - } - + int len = length(); + String new_string = *this; + new_string.resize(p_count * len + 1); + + char32_t *dst = new_string.ptrw(); + int offset = 1; + int stride = 1; + while (offset < p_count) { + memcpy(dst + offset * len, dst, stride * len * sizeof(char32_t)); + offset += stride; + stride = MIN(stride * 2, p_count - offset); + } + dst[p_count * len] = _null; return new_string; } -String String::left(int p_pos) const { - if (p_pos < 0) { - p_pos = length() + p_pos; +String String::left(int p_len) const { + if (p_len < 0) { + p_len = length() + p_len; } - if (p_pos <= 0) { + if (p_len <= 0) { return ""; } - if (p_pos >= length()) { + if (p_len >= length()) { return *this; } - return substr(0, p_pos); + return substr(0, p_len); } -String String::right(int p_pos) const { - if (p_pos < 0) { - p_pos = length() + p_pos; +String String::right(int p_len) const { + if (p_len < 0) { + p_len = length() + p_len; } - if (p_pos <= 0) { + if (p_len <= 0) { return ""; } - if (p_pos >= length()) { + if (p_len >= length()) { return *this; } - return substr(length() - p_pos); + return substr(length() - p_len); } char32_t String::unicode_at(int p_idx) const { @@ -3545,29 +3703,43 @@ bool String::is_network_share_path() const { String String::simplify_path() const { String s = *this; String drive; - if (s.begins_with("local://")) { - drive = "local://"; - s = s.substr(8, s.length()); - } else if (s.begins_with("res://")) { - drive = "res://"; - s = s.substr(6, s.length()); - } else if (s.begins_with("user://")) { - drive = "user://"; - s = s.substr(7, s.length()); - } else if (is_network_share_path()) { - drive = s.substr(0, 2); - s = s.substr(2, s.length() - 2); - } else if (s.begins_with("/") || s.begins_with("\\")) { - drive = s.substr(0, 1); - s = s.substr(1, s.length() - 1); - } else { - int p = s.find(":/"); - if (p == -1) { - p = s.find(":\\"); + + // Check if we have a special path (like res://) or a protocol identifier. + int p = s.find("://"); + bool found = false; + if (p > 0) { + bool only_chars = true; + for (int i = 0; i < p; i++) { + if (!is_ascii_alphanumeric_char(s[i])) { + only_chars = false; + break; + } } - if (p != -1 && p < s.find("/")) { - drive = s.substr(0, p + 2); - s = s.substr(p + 2, s.length()); + if (only_chars) { + found = true; + drive = s.substr(0, p + 3); + s = s.substr(p + 3); + } + } + if (!found) { + if (is_network_share_path()) { + // Network path, beginning with // or \\. + drive = s.substr(0, 2); + s = s.substr(2); + } else if (s.begins_with("/") || s.begins_with("\\")) { + // Absolute path. + drive = s.substr(0, 1); + s = s.substr(1); + } else { + // Windows-style drive path, like C:/ or C:\. + p = s.find(":/"); + if (p == -1) { + p = s.find(":\\"); + } + if (p != -1 && p < s.find("/")) { + drive = s.substr(0, p + 2); + s = s.substr(p + 2); + } } } @@ -3762,7 +3934,6 @@ String String::c_unescape() const { escaped = escaped.replace("\\v", "\v"); escaped = escaped.replace("\\'", "\'"); escaped = escaped.replace("\\\"", "\""); - escaped = escaped.replace("\\?", "\?"); escaped = escaped.replace("\\\\", "\\"); return escaped; @@ -3779,7 +3950,6 @@ String String::c_escape() const { escaped = escaped.replace("\t", "\\t"); escaped = escaped.replace("\v", "\\v"); escaped = escaped.replace("\'", "\\'"); - escaped = escaped.replace("\?", "\\?"); escaped = escaped.replace("\"", "\\\""); return escaped; @@ -4343,7 +4513,7 @@ String String::get_extension() const { return substr(pos + 1, length()); } -String String::plus_file(const String &p_file) const { +String String::path_join(const String &p_file) const { if (is_empty()) { return p_file; } @@ -4529,15 +4699,18 @@ String String::sprintf(const Array &values, bool *error) const { double value = values[value_index]; bool is_negative = (value < 0); String str = String::num(ABS(value), min_decimals); + const bool is_finite = Math::is_finite(value); // Pad decimals out. - str = str.pad_decimals(min_decimals); + if (is_finite) { + str = str.pad_decimals(min_decimals); + } int initial_len = str.length(); // Padding. Leave room for sign later if required. int pad_chars_count = (is_negative || show_sign) ? min_chars - 1 : min_chars; - String pad_char = pad_with_zeros ? String("0") : String(" "); + String pad_char = (pad_with_zeros && is_finite) ? String("0") : String(" "); // Never pad NaN or inf with zeros if (left_justified) { str = str.rpad(pad_chars_count, pad_char); } else { @@ -4559,6 +4732,76 @@ String String::sprintf(const Array &values, bool *error) const { in_format = false; break; } + case 'v': { // Vector2/3/4/2i/3i/4i + if (value_index >= values.size()) { + return "not enough arguments for format string"; + } + + int count; + switch (values[value_index].get_type()) { + case Variant::VECTOR2: + case Variant::VECTOR2I: { + count = 2; + } break; + case Variant::VECTOR3: + case Variant::VECTOR3I: { + count = 3; + } break; + case Variant::VECTOR4: + case Variant::VECTOR4I: { + count = 4; + } break; + default: { + return "%v requires a vector type (Vector2/3/4/2i/3i/4i)"; + } + } + + Vector4 vec = values[value_index]; + String str = "("; + for (int i = 0; i < count; i++) { + double val = vec[i]; + String number_str = String::num(ABS(val), min_decimals); + const bool is_finite = Math::is_finite(val); + + // Pad decimals out. + if (is_finite) { + number_str = number_str.pad_decimals(min_decimals); + } + + int initial_len = number_str.length(); + + // Padding. Leave room for sign later if required. + int pad_chars_count = val < 0 ? min_chars - 1 : min_chars; + String pad_char = (pad_with_zeros && is_finite) ? String("0") : String(" "); // Never pad NaN or inf with zeros + if (left_justified) { + number_str = number_str.rpad(pad_chars_count, pad_char); + } else { + number_str = number_str.lpad(pad_chars_count, pad_char); + } + + // Add sign if needed. + if (val < 0) { + if (left_justified) { + number_str = number_str.insert(0, "-"); + } else { + number_str = number_str.insert(pad_with_zeros ? 0 : number_str.length() - initial_len, "-"); + } + } + + // Add number to combined string + str += number_str; + + if (i < count - 1) { + str += ", "; + } + } + str += ")"; + + formatted += str; + ++value_index; + in_format = false; + break; + } case 's': { // String if (value_index >= values.size()) { return "not enough arguments for format string"; @@ -4651,7 +4894,7 @@ String String::sprintf(const Array &values, bool *error) const { } break; } - case '.': { // Float separator. + case '.': { // Float/Vector separator. if (in_decimals) { return "too many decimal points in format"; } @@ -4665,8 +4908,12 @@ String String::sprintf(const Array &values, bool *error) const { return "not enough arguments for format string"; } - if (!values[value_index].is_num()) { - return "* wants number"; + Variant::Type value_type = values[value_index].get_type(); + if (!values[value_index].is_num() && + value_type != Variant::VECTOR2 && value_type != Variant::VECTOR2I && + value_type != Variant::VECTOR3 && value_type != Variant::VECTOR3I && + value_type != Variant::VECTOR4 && value_type != Variant::VECTOR4I) { + return "* wants number or vector"; } int size = values[value_index]; |