1 files changed, 80 insertions, 36 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index 05ea061798..d3287ab345 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -312,22 +312,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
 	return error;
 }
 
-static bool _is_alphanumeric(char32_t c) {
-	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
-}
-
-static bool _is_digit(char32_t c) {
-	return (c >= '0' && c <= '9');
-}
-
-static bool _is_hex_digit(char32_t c) {
-	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
-}
-
-static bool _is_binary_digit(char32_t c) {
-	return (c == '0' || c == '1');
-}
-
 GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
 	Token token(p_type);
 	token.start_line = start_line;
@@ -448,10 +432,10 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To
 }
 
 GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
-	if (!_is_alphanumeric(_peek())) {
+	if (!is_ascii_identifier_char(_peek())) {
 		push_error("Expected annotation identifier after \"@\".");
 	}
-	while (_is_alphanumeric(_peek())) {
+	while (is_ascii_identifier_char(_peek())) {
 		// Consume all identifier characters.
 		_advance();
 	}
@@ -526,7 +510,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
 #define MAX_KEYWORD_LENGTH 10
 
 	// Consume all alphanumeric characters.
-	while (_is_alphanumeric(_peek())) {
+	while (is_ascii_identifier_char(_peek())) {
 		_advance();
 	}
 
@@ -612,7 +596,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 	bool has_decimal = false;
 	bool has_exponent = false;
 	bool has_error = false;
-	bool (*digit_check_func)(char32_t) = _is_digit;
+	bool (*digit_check_func)(char32_t) = is_digit;
 
 	if (_peek(-1) == '.') {
 		has_decimal = true;
@@ -620,20 +604,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 		if (_peek() == 'x') {
 			// Hexadecimal.
 			base = 16;
-			digit_check_func = _is_hex_digit;
+			digit_check_func = is_hex_digit;
 			_advance();
 		} else if (_peek() == 'b') {
 			// Binary.
 			base = 2;
-			digit_check_func = _is_binary_digit;
+			digit_check_func = is_binary_digit;
 			_advance();
 		}
 	}
 
 	// Allow '_' to be used in a number, for readability.
 	bool previous_was_underscore = false;
-	while (digit_check_func(_peek()) || _peek() == '_') {
-		if (_peek() == '_') {
+	while (digit_check_func(_peek()) || is_underscore(_peek())) {
+		if (is_underscore(_peek())) {
 			if (previous_was_underscore) {
 				Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
 				error.start_column = column;
@@ -682,7 +666,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 			_advance();
 
 			// Consume decimal digits.
-			while (_is_digit(_peek()) || _peek() == '_') {
+			while (is_digit(_peek()) || is_underscore(_peek())) {
 				_advance();
 			}
 		}
@@ -696,7 +680,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 				_advance();
 			}
 			// Consume exponent digits.
-			if (!_is_digit(_peek())) {
+			if (!is_digit(_peek())) {
 				Token error = make_error(R"(Expected exponent value after "e".)");
 				error.start_column = column;
 				error.leftmost_column = column;
@@ -705,8 +689,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 				push_error(error);
 			}
 			previous_was_underscore = false;
-			while (_is_digit(_peek()) || _peek() == '_') {
-				if (_peek() == '_') {
+			while (is_digit(_peek()) || is_underscore(_peek())) {
+				if (is_underscore(_peek())) {
 					if (previous_was_underscore) {
 						Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
 						error.start_column = column;
@@ -733,7 +717,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 		error.rightmost_column = column + 1;
 		push_error(error);
 		has_error = true;
-	} else if (_is_alphanumeric(_peek())) {
+	} else if (is_ascii_identifier_char(_peek())) {
 		// Letter at the end of the number.
 		push_error("Invalid numeric notation.");
 	}
@@ -786,6 +770,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 	}
 
 	String result;
+	char32_t prev = 0;
+	int prev_pos = 0;
 
 	for (;;) {
 		// Consume actual string.
@@ -852,16 +838,18 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				case '\\':
 					escaped = '\\';
 					break;
-				case 'u':
+				case 'U':
+				case 'u': {
 					// Hexadecimal sequence.
-					for (int i = 0; i < 4; i++) {
+					int hex_len = (code == 'U') ? 6 : 4;
+					for (int j = 0; j < hex_len; j++) {
 						if (_is_at_end()) {
 							return make_error("Unterminated string.");
 						}
 
 						char32_t digit = _peek();
 						char32_t value = 0;
-						if (digit >= '0' && digit <= '9') {
+						if (is_digit(digit)) {
 							value = digit - '0';
 						} else if (digit >= 'a' && digit <= 'f') {
 							value = digit - 'a';
@@ -886,7 +874,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 
 						_advance();
 					}
-					break;
+				} break;
 				case '\r':
 					if (_peek() != '\n') {
 						// Carriage return without newline in string. (???)
@@ -909,11 +897,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 					valid_escape = false;
 					break;
 			}
+			// Parse UTF-16 pair.
+			if (valid_escape) {
+				if ((escaped & 0xfffffc00) == 0xd800) {
+					if (prev == 0) {
+						prev = escaped;
+						prev_pos = column - 2;
+						continue;
+					} else {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+						prev = 0;
+					}
+				} else if ((escaped & 0xfffffc00) == 0xdc00) {
+					if (prev == 0) {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+					} else {
+						escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+						prev = 0;
+					}
+				}
+				if (prev != 0) {
+					Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					error.start_column = prev_pos;
+					error.leftmost_column = error.start_column;
+					push_error(error);
+					prev = 0;
+				}
+			}
 
 			if (valid_escape) {
 				result += escaped;
 			}
 		} else if (ch == quote_char) {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			_advance();
 			if (is_multiline) {
 				if (_peek() == quote_char && _peek(1) == quote_char) {
@@ -930,6 +960,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				break;
 			}
 		} else {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			result += ch;
 			_advance();
 			if (ch == '\n') {
@@ -937,6 +974,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 			}
 		}
 	}
+	if (prev != 0) {
+		Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+		error.start_column = prev_pos;
+		error.leftmost_column = error.start_column;
+		push_error(error);
+		prev = 0;
+	}
 
 	// Make the literal.
 	Variant string;
@@ -1262,9 +1306,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 
 	line_continuation = false;
 
-	if (_is_digit(c)) {
+	if (is_digit(c)) {
 		return number();
-	} else if (_is_alphanumeric(c)) {
+	} else if (is_ascii_identifier_char(c)) {
 		return potential_identifier();
 	}
 
@@ -1332,7 +1376,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 			if (_peek() == '.') {
 				_advance();
 				return make_token(Token::PERIOD_PERIOD);
-			} else if (_is_digit(_peek())) {
+			} else if (is_digit(_peek())) {
 				// Number starting with '.'.
 				return number();
 			} else {