summaryrefslogtreecommitdiff
path: root/modules/gdscript/gdscript_tokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'modules/gdscript/gdscript_tokenizer.cpp')
-rw-r--r--modules/gdscript/gdscript_tokenizer.cpp140
1 files changed, 100 insertions, 40 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index d4a098811a..63fad0d9bb 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -5,8 +5,8 @@
/* GODOT ENGINE */
/* https://godotengine.org */
/*************************************************************************/
-/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */
-/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */
+/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur. */
+/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md). */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
@@ -312,22 +312,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
return error;
}
-static bool _is_alphanumeric(char32_t c) {
- return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
-}
-
-static bool _is_digit(char32_t c) {
- return (c >= '0' && c <= '9');
-}
-
-static bool _is_hex_digit(char32_t c) {
- return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
-}
-
-static bool _is_binary_digit(char32_t c) {
- return (c == '0' || c == '1');
-}
-
GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
Token token(p_type);
token.start_line = start_line;
@@ -448,10 +432,10 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To
}
GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
- if (!_is_alphanumeric(_peek())) {
+ if (!is_ascii_identifier_char(_peek())) {
push_error("Expected annotation identifier after \"@\".");
}
- while (_is_alphanumeric(_peek())) {
+ while (is_ascii_identifier_char(_peek())) {
// Consume all identifier characters.
_advance();
}
@@ -526,7 +510,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
#define MAX_KEYWORD_LENGTH 10
// Consume all alphanumeric characters.
- while (_is_alphanumeric(_peek())) {
+ while (is_ascii_identifier_char(_peek())) {
_advance();
}
@@ -612,7 +596,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
bool has_decimal = false;
bool has_exponent = false;
bool has_error = false;
- bool (*digit_check_func)(char32_t) = _is_digit;
+ bool (*digit_check_func)(char32_t) = is_digit;
if (_peek(-1) == '.') {
has_decimal = true;
@@ -620,20 +604,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
if (_peek() == 'x') {
// Hexadecimal.
base = 16;
- digit_check_func = _is_hex_digit;
+ digit_check_func = is_hex_digit;
_advance();
} else if (_peek() == 'b') {
// Binary.
base = 2;
- digit_check_func = _is_binary_digit;
+ digit_check_func = is_binary_digit;
_advance();
}
}
// Allow '_' to be used in a number, for readability.
bool previous_was_underscore = false;
- while (digit_check_func(_peek()) || _peek() == '_') {
- if (_peek() == '_') {
+ while (digit_check_func(_peek()) || is_underscore(_peek())) {
+ if (is_underscore(_peek())) {
if (previous_was_underscore) {
Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
error.start_column = column;
@@ -682,7 +666,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
_advance();
// Consume decimal digits.
- while (_is_digit(_peek()) || _peek() == '_') {
+ while (is_digit(_peek()) || is_underscore(_peek())) {
_advance();
}
}
@@ -696,7 +680,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
_advance();
}
// Consume exponent digits.
- if (!_is_digit(_peek())) {
+ if (!is_digit(_peek())) {
Token error = make_error(R"(Expected exponent value after "e".)");
error.start_column = column;
error.leftmost_column = column;
@@ -705,8 +689,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
push_error(error);
}
previous_was_underscore = false;
- while (_is_digit(_peek()) || _peek() == '_') {
- if (_peek() == '_') {
+ while (is_digit(_peek()) || is_underscore(_peek())) {
+ if (is_underscore(_peek())) {
if (previous_was_underscore) {
Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
error.start_column = column;
@@ -733,7 +717,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
error.rightmost_column = column + 1;
push_error(error);
has_error = true;
- } else if (_is_alphanumeric(_peek())) {
+ } else if (is_ascii_identifier_char(_peek())) {
// Letter at the end of the number.
push_error("Invalid numeric notation.");
}
@@ -786,6 +770,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
}
String result;
+ char32_t prev = 0;
+ int prev_pos = 0;
for (;;) {
// Consume actual string.
@@ -795,6 +781,15 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
char32_t ch = _peek();
+ if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
+ Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
+ error.start_column = column;
+ error.leftmost_column = error.start_column;
+ error.end_column = column + 1;
+ error.rightmost_column = error.end_column;
+ push_error(error);
+ }
+
if (ch == '\\') {
// Escape pattern.
_advance();
@@ -843,16 +838,18 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
case '\\':
escaped = '\\';
break;
- case 'u':
+ case 'U':
+ case 'u': {
// Hexadecimal sequence.
- for (int i = 0; i < 4; i++) {
+ int hex_len = (code == 'U') ? 6 : 4;
+ for (int j = 0; j < hex_len; j++) {
if (_is_at_end()) {
return make_error("Unterminated string.");
}
char32_t digit = _peek();
char32_t value = 0;
- if (digit >= '0' && digit <= '9') {
+ if (is_digit(digit)) {
value = digit - '0';
} else if (digit >= 'a' && digit <= 'f') {
value = digit - 'a';
@@ -877,7 +874,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
_advance();
}
- break;
+ } break;
case '\r':
if (_peek() != '\n') {
// Carriage return without newline in string. (???)
@@ -900,11 +897,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
valid_escape = false;
break;
}
+ // Parse UTF-16 pair.
+ if (valid_escape) {
+ if ((escaped & 0xfffffc00) == 0xd800) {
+ if (prev == 0) {
+ prev = escaped;
+ prev_pos = column - 2;
+ continue;
+ } else {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ prev = 0;
+ }
+ } else if ((escaped & 0xfffffc00) == 0xdc00) {
+ if (prev == 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ } else {
+ escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+ prev = 0;
+ }
+ }
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
+ }
if (valid_escape) {
result += escaped;
}
} else if (ch == quote_char) {
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
_advance();
if (is_multiline) {
if (_peek() == quote_char && _peek(1) == quote_char) {
@@ -921,6 +960,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
break;
}
} else {
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
result += ch;
_advance();
if (ch == '\n') {
@@ -928,6 +974,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
}
}
}
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
// Make the literal.
Variant string;
@@ -1064,7 +1117,8 @@ void GDScriptTokenizer::check_indent() {
// First time indenting, choose character now.
indent_char = current_indent_char;
} else if (current_indent_char != indent_char) {
- Token error = make_error(vformat("Used \"%s\" for indentation instead \"%s\" as used before in the file.", String(&current_indent_char, 1).c_escape(), String(&indent_char, 1).c_escape()));
+ Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
+ _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
error.start_line = line;
error.start_column = 1;
error.leftmost_column = 1;
@@ -1114,6 +1168,12 @@ void GDScriptTokenizer::check_indent() {
}
}
+String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
+ ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());
+
+ return ch == ' ' ? "space" : "tab";
+}
+
void GDScriptTokenizer::_skip_whitespace() {
if (pending_indents != 0) {
// Still have some indent/dedent tokens to give.
@@ -1246,9 +1306,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
line_continuation = false;
- if (_is_digit(c)) {
+ if (is_digit(c)) {
return number();
- } else if (_is_alphanumeric(c)) {
+ } else if (is_ascii_identifier_char(c)) {
return potential_identifier();
}
@@ -1316,7 +1376,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (_peek() == '.') {
_advance();
return make_token(Token::PERIOD_PERIOD);
- } else if (_is_digit(_peek())) {
+ } else if (is_digit(_peek())) {
// Number starting with '.'.
return number();
} else {
@@ -1433,7 +1493,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
}
default:
- return make_error(vformat(R"(Unknown character "%s".")", String(&c, 1)));
+ return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
}
}