1 files changed, 100 insertions, 40 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index d4a098811a..63fad0d9bb 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -5,8 +5,8 @@
 /*                           GODOT ENGINE                                */
 /*                      https://godotengine.org                          */
 /*************************************************************************/
-/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur.                 */
-/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md).   */
+/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur.                 */
+/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md).   */
 /*                                                                       */
 /* Permission is hereby granted, free of charge, to any person obtaining */
 /* a copy of this software and associated documentation files (the       */
@@ -312,22 +312,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
 	return error;
 }
 
-static bool _is_alphanumeric(char32_t c) {
-	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
-}
-
-static bool _is_digit(char32_t c) {
-	return (c >= '0' && c <= '9');
-}
-
-static bool _is_hex_digit(char32_t c) {
-	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
-}
-
-static bool _is_binary_digit(char32_t c) {
-	return (c == '0' || c == '1');
-}
-
 GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
 	Token token(p_type);
 	token.start_line = start_line;
@@ -448,10 +432,10 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To
 }
 
 GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
-	if (!_is_alphanumeric(_peek())) {
+	if (!is_ascii_identifier_char(_peek())) {
 		push_error("Expected annotation identifier after \"@\".");
 	}
-	while (_is_alphanumeric(_peek())) {
+	while (is_ascii_identifier_char(_peek())) {
 		// Consume all identifier characters.
 		_advance();
 	}
@@ -526,7 +510,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
 #define MAX_KEYWORD_LENGTH 10
 
 	// Consume all alphanumeric characters.
-	while (_is_alphanumeric(_peek())) {
+	while (is_ascii_identifier_char(_peek())) {
 		_advance();
 	}
 
@@ -612,7 +596,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 	bool has_decimal = false;
 	bool has_exponent = false;
 	bool has_error = false;
-	bool (*digit_check_func)(char32_t) = _is_digit;
+	bool (*digit_check_func)(char32_t) = is_digit;
 
 	if (_peek(-1) == '.') {
 		has_decimal = true;
@@ -620,20 +604,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 		if (_peek() == 'x') {
 			// Hexadecimal.
 			base = 16;
-			digit_check_func = _is_hex_digit;
+			digit_check_func = is_hex_digit;
 			_advance();
 		} else if (_peek() == 'b') {
 			// Binary.
 			base = 2;
-			digit_check_func = _is_binary_digit;
+			digit_check_func = is_binary_digit;
 			_advance();
 		}
 	}
 
 	// Allow '_' to be used in a number, for readability.
 	bool previous_was_underscore = false;
-	while (digit_check_func(_peek()) || _peek() == '_') {
-		if (_peek() == '_') {
+	while (digit_check_func(_peek()) || is_underscore(_peek())) {
+		if (is_underscore(_peek())) {
 			if (previous_was_underscore) {
 				Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
 				error.start_column = column;
@@ -682,7 +666,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 			_advance();
 
 			// Consume decimal digits.
-			while (_is_digit(_peek()) || _peek() == '_') {
+			while (is_digit(_peek()) || is_underscore(_peek())) {
 				_advance();
 			}
 		}
@@ -696,7 +680,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 				_advance();
 			}
 			// Consume exponent digits.
-			if (!_is_digit(_peek())) {
+			if (!is_digit(_peek())) {
 				Token error = make_error(R"(Expected exponent value after "e".)");
 				error.start_column = column;
 				error.leftmost_column = column;
@@ -705,8 +689,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 				push_error(error);
 			}
 			previous_was_underscore = false;
-			while (_is_digit(_peek()) || _peek() == '_') {
-				if (_peek() == '_') {
+			while (is_digit(_peek()) || is_underscore(_peek())) {
+				if (is_underscore(_peek())) {
 					if (previous_was_underscore) {
 						Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
 						error.start_column = column;
@@ -733,7 +717,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
 		error.rightmost_column = column + 1;
 		push_error(error);
 		has_error = true;
-	} else if (_is_alphanumeric(_peek())) {
+	} else if (is_ascii_identifier_char(_peek())) {
 		// Letter at the end of the number.
 		push_error("Invalid numeric notation.");
 	}
@@ -786,6 +770,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 	}
 
 	String result;
+	char32_t prev = 0;
+	int prev_pos = 0;
 
 	for (;;) {
 		// Consume actual string.
@@ -795,6 +781,15 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 
 		char32_t ch = _peek();
 
+		if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
+			Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+			error.start_column = column;
+			error.leftmost_column = error.start_column;
+			error.end_column = column + 1;
+			error.rightmost_column = error.end_column;
+			push_error(error);
+		}
+
 		if (ch == '\\') {
 			// Escape pattern.
 			_advance();
@@ -843,16 +838,18 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				case '\\':
 					escaped = '\\';
 					break;
-				case 'u':
+				case 'U':
+				case 'u': {
 					// Hexadecimal sequence.
-					for (int i = 0; i < 4; i++) {
+					int hex_len = (code == 'U') ? 6 : 4;
+					for (int j = 0; j < hex_len; j++) {
 						if (_is_at_end()) {
 							return make_error("Unterminated string.");
 						}
 
 						char32_t digit = _peek();
 						char32_t value = 0;
-						if (digit >= '0' && digit <= '9') {
+						if (is_digit(digit)) {
 							value = digit - '0';
 						} else if (digit >= 'a' && digit <= 'f') {
 							value = digit - 'a';
@@ -877,7 +874,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 
 						_advance();
 					}
-					break;
+				} break;
 				case '\r':
 					if (_peek() != '\n') {
 						// Carriage return without newline in string. (???)
@@ -900,11 +897,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 					valid_escape = false;
 					break;
 			}
+			// Parse UTF-16 pair.
+			if (valid_escape) {
+				if ((escaped & 0xfffffc00) == 0xd800) {
+					if (prev == 0) {
+						prev = escaped;
+						prev_pos = column - 2;
+						continue;
+					} else {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+						prev = 0;
+					}
+				} else if ((escaped & 0xfffffc00) == 0xdc00) {
+					if (prev == 0) {
+						Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+						error.start_column = column - 2;
+						error.leftmost_column = error.start_column;
+						push_error(error);
+						valid_escape = false;
+					} else {
+						escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+						prev = 0;
+					}
+				}
+				if (prev != 0) {
+					Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+					error.start_column = prev_pos;
+					error.leftmost_column = error.start_column;
+					push_error(error);
+					prev = 0;
+				}
+			}
 
 			if (valid_escape) {
 				result += escaped;
 			}
 		} else if (ch == quote_char) {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			_advance();
 			if (is_multiline) {
 				if (_peek() == quote_char && _peek(1) == quote_char) {
@@ -921,6 +960,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 				break;
 			}
 		} else {
+			if (prev != 0) {
+				Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+				error.start_column = prev_pos;
+				error.leftmost_column = error.start_column;
+				push_error(error);
+				prev = 0;
+			}
 			result += ch;
 			_advance();
 			if (ch == '\n') {
@@ -928,6 +974,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
 			}
 		}
 	}
+	if (prev != 0) {
+		Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+		error.start_column = prev_pos;
+		error.leftmost_column = error.start_column;
+		push_error(error);
+		prev = 0;
+	}
 
 	// Make the literal.
 	Variant string;
@@ -1064,7 +1117,8 @@ void GDScriptTokenizer::check_indent() {
 			// First time indenting, choose character now.
 			indent_char = current_indent_char;
 		} else if (current_indent_char != indent_char) {
-			Token error = make_error(vformat("Used \"%s\" for indentation instead \"%s\" as used before in the file.", String(&current_indent_char, 1).c_escape(), String(&indent_char, 1).c_escape()));
+			Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
+					_get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
 			error.start_line = line;
 			error.start_column = 1;
 			error.leftmost_column = 1;
@@ -1114,6 +1168,12 @@ void GDScriptTokenizer::check_indent() {
 	}
 }
 
+String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
+	ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());
+
+	return ch == ' ' ? "space" : "tab";
+}
+
 void GDScriptTokenizer::_skip_whitespace() {
 	if (pending_indents != 0) {
 		// Still have some indent/dedent tokens to give.
@@ -1246,9 +1306,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 
 	line_continuation = false;
 
-	if (_is_digit(c)) {
+	if (is_digit(c)) {
 		return number();
-	} else if (_is_alphanumeric(c)) {
+	} else if (is_ascii_identifier_char(c)) {
 		return potential_identifier();
 	}
 
@@ -1316,7 +1376,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 			if (_peek() == '.') {
 				_advance();
 				return make_token(Token::PERIOD_PERIOD);
-			} else if (_is_digit(_peek())) {
+			} else if (is_digit(_peek())) {
 				// Number starting with '.'.
 				return number();
 			} else {
@@ -1433,7 +1493,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
 			}
 
 		default:
-			return make_error(vformat(R"(Unknown character "%s".")", String(&c, 1)));
+			return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
 	}
 }