1 files changed, 439 insertions, 239 deletions
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 015dfbc651..d8b93998af 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -323,7 +323,13 @@ void String::copy_from(const char *p_cstr) {
 	char32_t *dst = this->ptrw();
 
 	for (size_t i = 0; i <= len; i++) {
-		dst[i] = p_cstr[i];
+		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
+		if (c == 0 && i < len) {
+			print_unicode_error("NUL character", true);
+			dst[i] = 0x20;
+		} else {
+			dst[i] = c;
+		}
 	}
 }
 
@@ -350,7 +356,13 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) {
 	char32_t *dst = this->ptrw();
 
 	for (int i = 0; i < len; i++) {
-		dst[i] = p_cstr[i];
+		uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
+		if (c == 0) {
+			print_unicode_error("NUL character", true);
+			dst[i] = 0x20;
+		} else {
+			dst[i] = c;
+		}
 	}
 	dst[len] = 0;
 }
@@ -376,14 +388,21 @@ void String::copy_from(const wchar_t *p_cstr, const int p_clip_to) {
 }
 
 void String::copy_from(const char32_t &p_char) {
+	if (p_char == 0) {
+		print_unicode_error("NUL character", true);
+		return;
+	}
+	if ((p_char & 0xfffff800) == 0xd800) {
+		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
+	}
+	if (p_char > 0x10ffff) {
+		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+	}
+
 	resize(2);
+
 	char32_t *dst = ptrw();
-	if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) {
-		print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + ".");
-		dst[0] = 0xfffd;
-	} else {
-		dst[0] = p_char;
-	}
+	dst[0] = p_char;
 	dst[1] = 0;
 }
 
@@ -437,12 +456,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) {
 	dst[p_length] = 0;
 
 	for (int i = 0; i < p_length; i++) {
-		if ((p_char[i] >= 0xd800 && p_char[i] <= 0xdfff) || (p_char[i] > 0x10ffff)) {
-			print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char[i], 16) + ".");
-			dst[i] = 0xfffd;
-		} else {
-			dst[i] = p_char[i];
+		if (p_char[i] == 0) {
+			print_unicode_error("NUL character", true);
+			dst[i] = 0x20;
+			continue;
+		}
+		if ((p_char[i] & 0xfffff800) == 0xd800) {
+			print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
 		}
+		if (p_char[i] > 0x10ffff) {
+			print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
+		}
+		dst[i] = p_char[i];
 	}
 }
 
@@ -481,7 +506,7 @@ String operator+(const wchar_t *p_chr, const String &p_str) {
 	// wchar_t is 16-bit
 	String tmp = String::utf16((const char16_t *)p_chr);
 #else
-	// wchar_t is 32-bi
+	// wchar_t is 32-bit
 	String tmp = (const char32_t *)p_chr;
 #endif
 	tmp += p_str;
@@ -506,10 +531,12 @@ String &String::operator+=(const String &p_str) {
 
 	resize(lhs_len + rhs_len + 1);
 
-	const char32_t *src = p_str.get_data();
+	const char32_t *src = p_str.ptr();
 	char32_t *dst = ptrw() + lhs_len;
 
-	memcpy(dst, src, (rhs_len + 1) * sizeof(char32_t));
+	// Don't copy the terminating null with `memcpy` to avoid undefined behavior when string is being added to itself (it would overlap the destination).
+	memcpy(dst, src, rhs_len * sizeof(char32_t));
+	*(dst + rhs_len) = _null;
 
 	return *this;
 }
@@ -527,7 +554,13 @@ String &String::operator+=(const char *p_str) {
 	char32_t *dst = ptrw() + lhs_len;
 
 	for (size_t i = 0; i <= rhs_len; i++) {
-		dst[i] = p_str[i];
+		uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
+		if (c == 0 && i < rhs_len) {
+			print_unicode_error("NUL character", true);
+			dst[i] = 0x20;
+		} else {
+			dst[i] = c;
+		}
 	}
 
 	return *this;
@@ -550,15 +583,21 @@ String &String::operator+=(const char32_t *p_str) {
 }
 
 String &String::operator+=(char32_t p_char) {
+	if (p_char == 0) {
+		print_unicode_error("NUL character", true);
+		return *this;
+	}
+	if ((p_char & 0xfffff800) == 0xd800) {
+		print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
+	}
+	if (p_char > 0x10ffff) {
+		print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+	}
+
 	const int lhs_len = length();
 	resize(lhs_len + 2);
 	char32_t *dst = ptrw();
-	if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) {
-		print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + ".");
-		dst[lhs_len] = 0xfffd;
-	} else {
-		dst[lhs_len] = p_char;
-	}
+	dst[lhs_len] = p_char;
 	dst[lhs_len + 1] = 0;
 
 	return *this;
@@ -931,62 +970,71 @@ const char32_t *String::get_data() const {
 	return size() ? &operator[](0) : &zero;
 }
 
-String String::capitalize() const {
-	String aux = this->camelcase_to_underscore(true).replace("_", " ").strip_edges();
-	String cap;
-	for (int i = 0; i < aux.get_slice_count(" "); i++) {
-		String slice = aux.get_slicec(' ', i);
-		if (slice.length() > 0) {
-			slice[0] = _find_upper(slice[0]);
-			if (i > 0) {
-				cap += " ";
-			}
-			cap += slice;
-		}
-	}
-
-	return cap;
-}
-
-String String::camelcase_to_underscore(bool lowercase) const {
+String String::_camelcase_to_underscore() const {
 	const char32_t *cstr = get_data();
 	String new_string;
 	int start_index = 0;
 
 	for (int i = 1; i < this->size(); i++) {
-		bool is_upper = is_ascii_upper_case(cstr[i]);
-		bool is_number = is_digit(cstr[i]);
+		bool is_prev_upper = is_ascii_upper_case(cstr[i - 1]);
+		bool is_prev_lower = is_ascii_lower_case(cstr[i - 1]);
+		bool is_prev_digit = is_digit(cstr[i - 1]);
 
-		bool are_next_2_lower = false;
-		bool is_next_lower = false;
-		bool is_next_number = false;
-		bool was_precedent_upper = is_ascii_upper_case(cstr[i - 1]);
-		bool was_precedent_number = is_digit(cstr[i - 1]);
-
-		if (i + 2 < this->size()) {
-			are_next_2_lower = is_ascii_lower_case(cstr[i + 1]) && is_ascii_lower_case(cstr[i + 2]);
-		}
+		bool is_curr_upper = is_ascii_upper_case(cstr[i]);
+		bool is_curr_lower = is_ascii_lower_case(cstr[i]);
+		bool is_curr_digit = is_digit(cstr[i]);
 
+		bool is_next_lower = false;
 		if (i + 1 < this->size()) {
 			is_next_lower = is_ascii_lower_case(cstr[i + 1]);
-			is_next_number = is_digit(cstr[i + 1]);
 		}
 
-		const bool cond_a = is_upper && !was_precedent_upper && !was_precedent_number;
-		const bool cond_b = was_precedent_upper && is_upper && are_next_2_lower;
-		const bool cond_c = is_number && !was_precedent_number;
-		const bool can_break_number_letter = is_number && !was_precedent_number && is_next_lower;
-		const bool can_break_letter_number = !is_number && was_precedent_number && (is_next_lower || is_next_number);
+		const bool cond_a = is_prev_lower && is_curr_upper; // aA
+		const bool cond_b = (is_prev_upper || is_prev_digit) && is_curr_upper && is_next_lower; // AAa, 2Aa
+		const bool cond_c = is_prev_digit && is_curr_lower && is_next_lower; // 2aa
+		const bool cond_d = (is_prev_upper || is_prev_lower) && is_curr_digit; // A2, a2
 
-		bool should_split = cond_a || cond_b || cond_c || can_break_number_letter || can_break_letter_number;
-		if (should_split) {
+		if (cond_a || cond_b || cond_c || cond_d) {
 			new_string += this->substr(start_index, i - start_index) + "_";
 			start_index = i;
 		}
 	}
 
 	new_string += this->substr(start_index, this->size() - start_index);
-	return lowercase ? new_string.to_lower() : new_string;
+	return new_string.to_lower();
+}
+
+String String::capitalize() const {
+	String aux = this->_camelcase_to_underscore().replace("_", " ").strip_edges();
+	String cap;
+	for (int i = 0; i < aux.get_slice_count(" "); i++) {
+		String slice = aux.get_slicec(' ', i);
+		if (slice.length() > 0) {
+			slice[0] = _find_upper(slice[0]);
+			if (i > 0) {
+				cap += " ";
+			}
+			cap += slice;
+		}
+	}
+
+	return cap;
+}
+
+String String::to_camel_case() const {
+	String s = this->to_pascal_case();
+	if (!s.is_empty()) {
+		s[0] = _find_lower(s[0]);
+	}
+	return s;
+}
+
+String String::to_pascal_case() const {
+	return this->capitalize().replace(" ", "");
+}
+
+String String::to_snake_case() const {
+	return this->_camelcase_to_underscore().replace(" ", "_").strip_edges();
 }
 
 String String::get_with_code_lines() const {
@@ -1583,6 +1631,14 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) {
 	return ret;
 }
 
+void String::print_unicode_error(const String &p_message, bool p_critical) const {
+	if (p_critical) {
+		print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message));
+	} else {
+		print_error(vformat("Unicode parsing error: %s", p_message));
+	}
+}
+
 CharString String::ascii(bool p_allow_extended) const {
 	if (!length()) {
 		return CharString();
@@ -1596,7 +1652,7 @@ CharString String::ascii(bool p_allow_extended) const {
 		if ((c <= 0x7f) || (c <= 0xff && p_allow_extended)) {
 			cs[i] = c;
 		} else {
-			print_error("Unicode parsing error: Cannot represent " + num_int64(c, 16) + " as ASCII/Latin-1 character.");
+			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
 			cs[i] = 0x20;
 		}
 	}
@@ -1611,11 +1667,9 @@ String String::utf8(const char *p_utf8, int p_len) {
 	return ret;
 }
 
-bool String::parse_utf8(const char *p_utf8, int p_len) {
-#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-8?");
-
+Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
 	if (!p_utf8) {
-		return true;
+		return ERR_INVALID_DATA;
 	}
 
 	String aux;
@@ -1635,14 +1689,21 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
 		}
 	}
 
+	bool decode_error = false;
+	bool decode_failed = false;
 	{
 		const char *ptrtmp = p_utf8;
 		const char *ptrtmp_limit = &p_utf8[p_len];
 		int skip = 0;
+		uint8_t c_start = 0;
 		while (ptrtmp != ptrtmp_limit && *ptrtmp) {
-			if (skip == 0) {
-				uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
+			uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp);
 
+			if (skip == 0) {
+				if (p_skip_cr && c == '\r') {
+					ptrtmp++;
+					continue;
+				}
 				/* Determine the number of characters in sequence */
 				if ((c & 0x80) == 0) {
 					skip = 0;
@@ -1652,20 +1713,34 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
 					skip = 2;
 				} else if ((c & 0xf8) == 0xf0) {
 					skip = 3;
+				} else if ((c & 0xfc) == 0xf8) {
+					skip = 4;
+				} else if ((c & 0xfe) == 0xfc) {
+					skip = 5;
 				} else {
-					UNICERROR("invalid skip at " + num_int64(cstr_size));
-					return true; //invalid utf8
+					skip = 0;
+					print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true);
+					decode_failed = true;
 				}
+				c_start = c;
 
 				if (skip == 1 && (c & 0x1e) == 0) {
-					UNICERROR("overlong rejected at " + num_int64(cstr_size));
-					return true; //reject overlong
+					print_unicode_error(vformat("Overlong encoding (%x ...)", c));
+					decode_error = true;
 				}
-
 				str_size++;
-
 			} else {
-				--skip;
+				if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) {
+					print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c));
+					decode_error = true;
+				}
+				if (c < 0x80 || c > 0xbf) {
+					print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true);
+					decode_failed = true;
+					skip = 0;
+				} else {
+					--skip;
+				}
 			}
 
 			cstr_size++;
@@ -1673,80 +1748,95 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
 		}
 
 		if (skip) {
-			UNICERROR("no space left");
-			return true; //not enough space
+			print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true);
+			decode_failed = true;
 		}
 	}
 
 	if (str_size == 0) {
 		clear();
-		return false;
+		return OK; // empty string
 	}
 
 	resize(str_size + 1);
 	char32_t *dst = ptrw();
 	dst[str_size] = 0;
 
+	int skip = 0;
+	uint32_t unichar = 0;
 	while (cstr_size) {
-		int len = 0;
-
-		/* Determine the number of characters in sequence */
-		if ((*p_utf8 & 0x80) == 0) {
-			len = 1;
-		} else if ((*p_utf8 & 0xe0) == 0xc0) {
-			len = 2;
-		} else if ((*p_utf8 & 0xf0) == 0xe0) {
-			len = 3;
-		} else if ((*p_utf8 & 0xf8) == 0xf0) {
-			len = 4;
-		} else {
-			UNICERROR("invalid len");
-			return true; //invalid UTF8
-		}
-
-		if (len > cstr_size) {
-			UNICERROR("no space left");
-			return true; //not enough space
-		}
-
-		if (len == 2 && (*p_utf8 & 0x1E) == 0) {
-			UNICERROR("no space left");
-			return true; //reject overlong
-		}
+		uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8);
 
-		/* Convert the first character */
-
-		uint32_t unichar = 0;
-
-		if (len == 1) {
-			unichar = *p_utf8;
+		if (skip == 0) {
+			if (p_skip_cr && c == '\r') {
+				p_utf8++;
+				continue;
+			}
+			/* Determine the number of characters in sequence */
+			if ((c & 0x80) == 0) {
+				*(dst++) = c;
+				unichar = 0;
+				skip = 0;
+			} else if ((c & 0xe0) == 0xc0) {
+				unichar = (0xff >> 3) & c;
+				skip = 1;
+			} else if ((c & 0xf0) == 0xe0) {
+				unichar = (0xff >> 4) & c;
+				skip = 2;
+			} else if ((c & 0xf8) == 0xf0) {
+				unichar = (0xff >> 5) & c;
+				skip = 3;
+			} else if ((c & 0xfc) == 0xf8) {
+				unichar = (0xff >> 6) & c;
+				skip = 4;
+			} else if ((c & 0xfe) == 0xfc) {
+				unichar = (0xff >> 7) & c;
+				skip = 5;
+			} else {
+				*(dst++) = 0x20;
+				unichar = 0;
+				skip = 0;
+			}
 		} else {
-			unichar = (0xff >> (len + 1)) & *p_utf8;
-
-			for (int i = 1; i < len; i++) {
-				if ((p_utf8[i] & 0xc0) != 0x80) {
-					UNICERROR("invalid utf8");
-					return true; //invalid utf8
-				}
-				if (unichar == 0 && i == 2 && ((p_utf8[i] & 0x7f) >> (7 - len)) == 0) {
-					UNICERROR("invalid utf8 overlong");
-					return true; //no overlong
+			if (c < 0x80 || c > 0xbf) {
+				*(dst++) = 0x20;
+				skip = 0;
+			} else {
+				unichar = (unichar << 6) | (c & 0x3f);
+				--skip;
+				if (skip == 0) {
+					if (unichar == 0) {
+						print_unicode_error("NUL character", true);
+						decode_failed = true;
+						unichar = 0x20;
+					}
+					if ((unichar & 0xfffff800) == 0xd800) {
+						print_unicode_error(vformat("Unpaired surrogate (%x)", unichar));
+						decode_error = true;
+					}
+					if (unichar > 0x10ffff) {
+						print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar));
+						decode_error = true;
+					}
+					*(dst++) = unichar;
 				}
-				unichar = (unichar << 6) | (p_utf8[i] & 0x3f);
 			}
 		}
-		if (unichar >= 0xd800 && unichar <= 0xdfff) {
-			UNICERROR("invalid code point");
-			return CharString();
-		}
 
-		*(dst++) = unichar;
-		cstr_size -= len;
-		p_utf8 += len;
+		cstr_size--;
+		p_utf8++;
+	}
+	if (skip) {
+		*(dst++) = 0x20;
 	}
 
-	return false;
-#undef UNICERROR
+	if (decode_failed) {
+		return ERR_INVALID_DATA;
+	} else if (decode_error) {
+		return ERR_PARSE_ERROR;
+	} else {
+		return OK;
+	}
 }
 
 CharString String::utf8() const {
@@ -1765,15 +1855,17 @@ CharString String::utf8() const {
 			fl += 2;
 		} else if (c <= 0xffff) { // 16 bits
 			fl += 3;
-		} else if (c <= 0x0010ffff) { // 21 bits
+		} else if (c <= 0x001fffff) { // 21 bits
 			fl += 4;
+		} else if (c <= 0x03ffffff) { // 26 bits
+			fl += 5;
+			print_unicode_error(vformat("Invalid unicode codepoint (%x)", c));
+		} else if (c <= 0x7fffffff) { // 31 bits
+			fl += 6;
+			print_unicode_error(vformat("Invalid unicode codepoint (%x)", c));
 		} else {
-			print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
-			return CharString();
-		}
-		if (c >= 0xd800 && c <= 0xdfff) {
-			print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
-			return CharString();
+			fl += 1;
+			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-8", c), true);
 		}
 	}
 
@@ -1799,11 +1891,26 @@ CharString String::utf8() const {
 			APPEND_CHAR(uint32_t(0xe0 | ((c >> 12) & 0x0f))); // Top 4 bits.
 			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Middle 6 bits.
 			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
-		} else { // 21 bits
+		} else if (c <= 0x001fffff) { // 21 bits
 			APPEND_CHAR(uint32_t(0xf0 | ((c >> 18) & 0x07))); // Top 3 bits.
 			APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper middle 6 bits.
 			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits.
 			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+		} else if (c <= 0x03ffffff) { // 26 bits
+			APPEND_CHAR(uint32_t(0xf8 | ((c >> 24) & 0x03))); // Top 2 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Upper middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+		} else if (c <= 0x7fffffff) { // 31 bits
+			APPEND_CHAR(uint32_t(0xfc | ((c >> 30) & 0x01))); // Top 1 bit.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 24) & 0x3f))); // Upper upper middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Lower upper middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper lower middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
+			APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
+		} else {
+			APPEND_CHAR(0x20);
 		}
 	}
 #undef APPEND_CHAR
@@ -1819,11 +1926,9 @@ String String::utf16(const char16_t *p_utf16, int p_len) {
 	return ret;
 }
 
-bool String::parse_utf16(const char16_t *p_utf16, int p_len) {
-#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-16?");
-
+Error String::parse_utf16(const char16_t *p_utf16, int p_len) {
 	if (!p_utf16) {
-		return true;
+		return ERR_INVALID_DATA;
 	}
 
 	String aux;
@@ -1850,80 +1955,90 @@ bool String::parse_utf16(const char16_t *p_utf16, int p_len) {
 		}
 	}
 
+	bool decode_error = false;
 	{
 		const char16_t *ptrtmp = p_utf16;
 		const char16_t *ptrtmp_limit = &p_utf16[p_len];
-		int skip = 0;
+		uint32_t c_prev = 0;
+		bool skip = false;
 		while (ptrtmp != ptrtmp_limit && *ptrtmp) {
 			uint32_t c = (byteswap) ? BSWAP16(*ptrtmp) : *ptrtmp;
-			if (skip == 0) {
-				if ((c & 0xfffffc00) == 0xd800) {
-					skip = 1; // lead surrogate
-				} else if ((c & 0xfffffc00) == 0xdc00) {
-					UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size));
-					return true; // invalid UTF16
-				} else {
-					skip = 0;
+
+			if ((c & 0xfffffc00) == 0xd800) { // lead surrogate
+				if (skip) {
+					print_unicode_error(vformat("Unpaired lead surrogate (%x [trail?] %x)", c_prev, c));
+					decode_error = true;
 				}
-				str_size++;
-			} else {
-				if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
-					--skip;
+				skip = true;
+			} else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
+				if (skip) {
+					str_size--;
 				} else {
-					UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size));
-					return true; // invalid UTF16
+					print_unicode_error(vformat("Unpaired trail surrogate (%x [lead?] %x)", c_prev, c));
+					decode_error = true;
 				}
+				skip = false;
+			} else {
+				skip = false;
 			}
 
+			c_prev = c;
+			str_size++;
 			cstr_size++;
 			ptrtmp++;
 		}
 
 		if (skip) {
-			UNICERROR("no space left");
-			return true; // not enough space
+			print_unicode_error(vformat("Unpaired lead surrogate (%x [eol])", c_prev));
+			decode_error = true;
 		}
 	}
 
 	if (str_size == 0) {
 		clear();
-		return false;
+		return OK; // empty string
 	}
 
 	resize(str_size + 1);
 	char32_t *dst = ptrw();
 	dst[str_size] = 0;
 
+	bool skip = false;
+	uint32_t c_prev = 0;
 	while (cstr_size) {
-		int len = 0;
 		uint32_t c = (byteswap) ? BSWAP16(*p_utf16) : *p_utf16;
 
-		if ((c & 0xfffffc00) == 0xd800) {
-			len = 2;
+		if ((c & 0xfffffc00) == 0xd800) { // lead surrogate
+			if (skip) {
+				*(dst++) = c_prev; // unpaired, store as is
+			}
+			skip = true;
+		} else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate
+			if (skip) {
+				*(dst++) = (c_prev << 10UL) + c - ((0xd800 << 10UL) + 0xdc00 - 0x10000); // decode pair
+			} else {
+				*(dst++) = c; // unpaired, store as is
+			}
+			skip = false;
 		} else {
-			len = 1;
+			*(dst++) = c;
+			skip = false;
 		}
 
-		if (len > cstr_size) {
-			UNICERROR("no space left");
-			return true; //not enough space
-		}
-
-		uint32_t unichar = 0;
-		if (len == 1) {
-			unichar = c;
-		} else {
-			uint32_t c2 = (byteswap) ? BSWAP16(p_utf16[1]) : p_utf16[1];
-			unichar = (c << 10UL) + c2 - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
-		}
+		cstr_size--;
+		p_utf16++;
+		c_prev = c;
+	}
 
-		*(dst++) = unichar;
-		cstr_size -= len;
-		p_utf16 += len;
+	if (skip) {
+		*(dst++) = c_prev;
 	}
 
-	return false;
-#undef UNICERROR
+	if (decode_error) {
+		return ERR_PARSE_ERROR;
+	} else {
+		return OK;
+	}
 }
 
 Char16String String::utf16() const {
@@ -1938,15 +2053,14 @@ Char16String String::utf16() const {
 		uint32_t c = d[i];
 		if (c <= 0xffff) { // 16 bits.
 			fl += 1;
+			if ((c & 0xfffff800) == 0xd800) {
+				print_unicode_error(vformat("Unpaired surrogate (%x)", c));
+			}
 		} else if (c <= 0x10ffff) { // 32 bits.
 			fl += 2;
 		} else {
-			print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
-			return Char16String();
-		}
-		if (c >= 0xd800 && c <= 0xdfff) {
-			print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + ".");
-			return Char16String();
+			print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-16", c), true);
+			fl += 1;
 		}
 	}
 
@@ -1965,9 +2079,11 @@ Char16String String::utf16() const {
 
 		if (c <= 0xffff) { // 16 bits.
 			APPEND_CHAR(c);
-		} else { // 32 bits.
+		} else if (c <= 0x10ffff) { // 32 bits.
 			APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
 			APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
+		} else {
+			APPEND_CHAR(0x20);
 		}
 	}
 #undef APPEND_CHAR
@@ -2039,7 +2155,7 @@ int64_t String::hex_to_int() const {
 		}
 		// Check for overflow/underflow, with special case to ensure INT64_MIN does not result in error
 		bool overflow = ((hex > INT64_MAX / 16) && (sign == 1 || (sign == -1 && hex != (INT64_MAX >> 4) + 1))) || (sign == -1 && hex == (INT64_MAX >> 4) + 1 && c > '0');
-		ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as 64-bit integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+		ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 		hex *= 16;
 		hex += n;
 		s++;
@@ -2078,7 +2194,7 @@ int64_t String::bin_to_int() const {
 		}
 		// Check for overflow/underflow, with special case to ensure INT64_MIN does not result in error
 		bool overflow = ((binary > INT64_MAX / 2) && (sign == 1 || (sign == -1 && binary != (INT64_MAX >> 1) + 1))) || (sign == -1 && binary == (INT64_MAX >> 1) + 1 && c > '0');
-		ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as 64-bit integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+		ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 		binary *= 2;
 		binary += n;
 		s++;
@@ -2101,7 +2217,7 @@ int64_t String::to_int() const {
 		char32_t c = operator[](i);
 		if (is_digit(c)) {
 			bool overflow = (integer > INT64_MAX / 10) || (integer == INT64_MAX / 10 && ((sign == 1 && c > '7') || (sign == -1 && c > '8')));
-			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as 64-bit integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + *this + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 			integer *= 10;
 			integer += c - '0';
 
@@ -2130,7 +2246,7 @@ int64_t String::to_int(const char *p_str, int p_len) {
 		char c = p_str[i];
 		if (is_digit(c)) {
 			bool overflow = (integer > INT64_MAX / 10) || (integer == INT64_MAX / 10 && ((sign == 1 && c > '7') || (sign == -1 && c > '8')));
-			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + String(p_str).substr(0, to) + " as integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + String(p_str).substr(0, to) + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 			integer *= 10;
 			integer += c - '0';
 
@@ -2161,7 +2277,7 @@ int64_t String::to_int(const wchar_t *p_str, int p_len) {
 		wchar_t c = p_str[i];
 		if (is_digit(c)) {
 			bool overflow = (integer > INT64_MAX / 10) || (integer == INT64_MAX / 10 && ((sign == 1 && c > '7') || (sign == -1 && c > '8')));
-			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + String(p_str).substr(0, to) + " as integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+			ERR_FAIL_COND_V_MSG(overflow, sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + String(p_str).substr(0, to) + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 			integer *= 10;
 			integer += c - '0';
 
@@ -2483,7 +2599,7 @@ int64_t String::to_int(const char32_t *p_str, int p_len, bool p_clamp) {
 								return INT64_MIN;
 							}
 						} else {
-							ERR_FAIL_V_MSG(sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + number + " as integer, provided value is " + (sign == 1 ? "too big." : "too small."));
+							ERR_FAIL_V_MSG(sign == 1 ? INT64_MAX : INT64_MIN, "Cannot represent " + number + " as a 64-bit signed integer, since the value is " + (sign == 1 ? "too large." : "too small."));
 						}
 					}
 					integer *= 10;
@@ -3344,51 +3460,52 @@ String String::replacen(const String &p_key, const String &p_with) const {
 String String::repeat(int p_count) const {
 	ERR_FAIL_COND_V_MSG(p_count < 0, "", "Parameter count should be a positive number.");
 
-	String new_string;
-	const char32_t *src = this->get_data();
-
-	new_string.resize(length() * p_count + 1);
-	new_string[length() * p_count] = 0;
-
-	for (int i = 0; i < p_count; i++) {
-		for (int j = 0; j < length(); j++) {
-			new_string[i * length() + j] = src[j];
-		}
-	}
-
+	int len = length();
+	String new_string = *this;
+	new_string.resize(p_count * len + 1);
+
+	char32_t *dst = new_string.ptrw();
+	int offset = 1;
+	int stride = 1;
+	while (offset < p_count) {
+		memcpy(dst + offset * len, dst, stride * len * sizeof(char32_t));
+		offset += stride;
+		stride = MIN(stride * 2, p_count - offset);
+	}
+	dst[p_count * len] = _null;
 	return new_string;
 }
 
-String String::left(int p_pos) const {
-	if (p_pos < 0) {
-		p_pos = length() + p_pos;
+String String::left(int p_len) const {
+	if (p_len < 0) {
+		p_len = length() + p_len;
 	}
 
-	if (p_pos <= 0) {
+	if (p_len <= 0) {
 		return "";
 	}
 
-	if (p_pos >= length()) {
+	if (p_len >= length()) {
 		return *this;
 	}
 
-	return substr(0, p_pos);
+	return substr(0, p_len);
 }
 
-String String::right(int p_pos) const {
-	if (p_pos < 0) {
-		p_pos = length() + p_pos;
+String String::right(int p_len) const {
+	if (p_len < 0) {
+		p_len = length() + p_len;
 	}
 
-	if (p_pos <= 0) {
+	if (p_len <= 0) {
 		return "";
 	}
 
-	if (p_pos >= length()) {
+	if (p_len >= length()) {
 		return *this;
 	}
 
-	return substr(length() - p_pos);
+	return substr(length() - p_len);
 }
 
 char32_t String::unicode_at(int p_idx) const {
@@ -3655,6 +3772,31 @@ bool String::is_absolute_path() const {
 	}
 }
 
+static _FORCE_INLINE_ bool _is_valid_identifier_bit(int p_index, char32_t p_char) {
+	if (p_index == 0 && is_digit(p_char)) {
+		return false; // No start with number plz.
+	}
+	return is_ascii_identifier_char(p_char);
+}
+
+String String::validate_identifier() const {
+	if (is_empty()) {
+		return "_"; // Empty string is not a valid identifier;
+	}
+
+	String result = *this;
+	int len = result.length();
+	char32_t *buffer = result.ptrw();
+
+	for (int i = 0; i < len; i++) {
+		if (!_is_valid_identifier_bit(i, buffer[i])) {
+			buffer[i] = '_';
+		}
+	}
+
+	return result;
+}
+
 bool String::is_valid_identifier() const {
 	int len = length();
 
@@ -3665,15 +3807,7 @@ bool String::is_valid_identifier() const {
 	const char32_t *str = &operator[](0);
 
 	for (int i = 0; i < len; i++) {
-		if (i == 0) {
-			if (is_digit(str[0])) {
-				return false; // no start with number plz
-			}
-		}
-
-		bool valid_char = is_ascii_identifier_char(str[i]);
-
-		if (!valid_char) {
+		if (!_is_valid_identifier_bit(i, str[i])) {
 			return false;
 		}
 	}
@@ -3695,18 +3829,15 @@ String String::uri_encode() const {
 	const CharString temp = utf8();
 	String res;
 	for (int i = 0; i < temp.length(); ++i) {
-		char ord = temp[i];
+		uint8_t ord = temp[i];
 		if (ord == '.' || ord == '-' || ord == '~' || is_ascii_identifier_char(ord)) {
 			res += ord;
 		} else {
-			char h_Val[3];
-#if defined(__GNUC__) || defined(_MSC_VER)
-			snprintf(h_Val, 3, "%02hhX", ord);
-#else
-			sprintf(h_Val, "%02hhX", ord);
-#endif
-			res += "%";
-			res += h_Val;
+			char p[4] = { '%', 0, 0, 0 };
+			static const char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+			p[1] = hex[ord >> 4];
+			p[2] = hex[ord & 0xF];
+			res += p;
 		}
 	}
 	return res;
@@ -4329,7 +4460,7 @@ String String::get_extension() const {
 	return substr(pos + 1, length());
 }
 
-String String::plus_file(const String &p_file) const {
+String String::path_join(const String &p_file) const {
 	if (is_empty()) {
 		return p_file;
 	}
@@ -4545,6 +4676,71 @@ String String::sprintf(const Array &values, bool *error) const {
 					in_format = false;
 					break;
 				}
+				case 'v': { // Vector2/3/4/2i/3i/4i
+					if (value_index >= values.size()) {
+						return "not enough arguments for format string";
+					}
+
+					int count;
+					switch (values[value_index].get_type()) {
+						case Variant::VECTOR2:
+						case Variant::VECTOR2I: {
+							count = 2;
+						} break;
+						case Variant::VECTOR3:
+						case Variant::VECTOR3I: {
+							count = 3;
+						} break;
+						case Variant::VECTOR4:
+						case Variant::VECTOR4I: {
+							count = 4;
+						} break;
+						default: {
+							return "%v requires a vector type (Vector2/3/4/2i/3i/4i)";
+						}
+					}
+
+					Vector4 vec = values[value_index];
+					String str = "(";
+					for (int i = 0; i < count; i++) {
+						double val = vec[i];
+						// Pad decimals out.
+						String number_str = String::num(ABS(val), min_decimals).pad_decimals(min_decimals);
+
+						int initial_len = number_str.length();
+
+						// Padding. Leave room for sign later if required.
+						int pad_chars_count = val < 0 ? min_chars - 1 : min_chars;
+						String pad_char = pad_with_zeros ? String("0") : String(" ");
+						if (left_justified) {
+							number_str = number_str.rpad(pad_chars_count, pad_char);
+						} else {
+							number_str = number_str.lpad(pad_chars_count, pad_char);
+						}
+
+						// Add sign if needed.
+						if (val < 0) {
+							if (left_justified) {
+								number_str = number_str.insert(0, "-");
+							} else {
+								number_str = number_str.insert(pad_with_zeros ? 0 : number_str.length() - initial_len, "-");
+							}
+						}
+
+						// Add number to combined string
+						str += number_str;
+
+						if (i < count - 1) {
+							str += ", ";
+						}
+					}
+					str += ")";
+
+					formatted += str;
+					++value_index;
+					in_format = false;
+					break;
+				}
 				case 's': { // String
 					if (value_index >= values.size()) {
 						return "not enough arguments for format string";
@@ -4637,7 +4833,7 @@ String String::sprintf(const Array &values, bool *error) const {
 					}
 					break;
 				}
-				case '.': { // Float separator.
+				case '.': { // Float/Vector separator.
 					if (in_decimals) {
 						return "too many decimal points in format";
 					}
@@ -4651,8 +4847,12 @@ String String::sprintf(const Array &values, bool *error) const {
 						return "not enough arguments for format string";
 					}
 
-					if (!values[value_index].is_num()) {
-						return "* wants number";
+					Variant::Type value_type = values[value_index].get_type();
+					if (!values[value_index].is_num() &&
+							value_type != Variant::VECTOR2 && value_type != Variant::VECTOR2I &&
+							value_type != Variant::VECTOR3 && value_type != Variant::VECTOR3I &&
+							value_type != Variant::VECTOR4 && value_type != Variant::VECTOR4I) {
+						return "* wants number or vector";
 					}
 
 					int size = values[value_index];