summaryrefslogtreecommitdiff
path: root/modules/gdscript/gdscript_tokenizer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'modules/gdscript/gdscript_tokenizer.cpp')
-rw-r--r--modules/gdscript/gdscript_tokenizer.cpp116
1 files changed, 80 insertions, 36 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index 05ea061798..d3287ab345 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -312,22 +312,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
return error;
}
-static bool _is_alphanumeric(char32_t c) {
- return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_';
-}
-
-static bool _is_digit(char32_t c) {
- return (c >= '0' && c <= '9');
-}
-
-static bool _is_hex_digit(char32_t c) {
- return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
-}
-
-static bool _is_binary_digit(char32_t c) {
- return (c == '0' || c == '1');
-}
-
GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
Token token(p_type);
token.start_line = start_line;
@@ -448,10 +432,10 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To
}
GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
- if (!_is_alphanumeric(_peek())) {
+ if (!is_ascii_identifier_char(_peek())) {
push_error("Expected annotation identifier after \"@\".");
}
- while (_is_alphanumeric(_peek())) {
+ while (is_ascii_identifier_char(_peek())) {
// Consume all identifier characters.
_advance();
}
@@ -526,7 +510,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
#define MAX_KEYWORD_LENGTH 10
// Consume all alphanumeric characters.
- while (_is_alphanumeric(_peek())) {
+ while (is_ascii_identifier_char(_peek())) {
_advance();
}
@@ -612,7 +596,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
bool has_decimal = false;
bool has_exponent = false;
bool has_error = false;
- bool (*digit_check_func)(char32_t) = _is_digit;
+ bool (*digit_check_func)(char32_t) = is_digit;
if (_peek(-1) == '.') {
has_decimal = true;
@@ -620,20 +604,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
if (_peek() == 'x') {
// Hexadecimal.
base = 16;
- digit_check_func = _is_hex_digit;
+ digit_check_func = is_hex_digit;
_advance();
} else if (_peek() == 'b') {
// Binary.
base = 2;
- digit_check_func = _is_binary_digit;
+ digit_check_func = is_binary_digit;
_advance();
}
}
// Allow '_' to be used in a number, for readability.
bool previous_was_underscore = false;
- while (digit_check_func(_peek()) || _peek() == '_') {
- if (_peek() == '_') {
+ while (digit_check_func(_peek()) || is_underscore(_peek())) {
+ if (is_underscore(_peek())) {
if (previous_was_underscore) {
Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
error.start_column = column;
@@ -682,7 +666,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
_advance();
// Consume decimal digits.
- while (_is_digit(_peek()) || _peek() == '_') {
+ while (is_digit(_peek()) || is_underscore(_peek())) {
_advance();
}
}
@@ -696,7 +680,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
_advance();
}
// Consume exponent digits.
- if (!_is_digit(_peek())) {
+ if (!is_digit(_peek())) {
Token error = make_error(R"(Expected exponent value after "e".)");
error.start_column = column;
error.leftmost_column = column;
@@ -705,8 +689,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
push_error(error);
}
previous_was_underscore = false;
- while (_is_digit(_peek()) || _peek() == '_') {
- if (_peek() == '_') {
+ while (is_digit(_peek()) || is_underscore(_peek())) {
+ if (is_underscore(_peek())) {
if (previous_was_underscore) {
Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
error.start_column = column;
@@ -733,7 +717,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
error.rightmost_column = column + 1;
push_error(error);
has_error = true;
- } else if (_is_alphanumeric(_peek())) {
+ } else if (is_ascii_identifier_char(_peek())) {
// Letter at the end of the number.
push_error("Invalid numeric notation.");
}
@@ -786,6 +770,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
}
String result;
+ char32_t prev = 0;
+ int prev_pos = 0;
for (;;) {
// Consume actual string.
@@ -852,16 +838,18 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
case '\\':
escaped = '\\';
break;
- case 'u':
+ case 'U':
+ case 'u': {
// Hexadecimal sequence.
- for (int i = 0; i < 4; i++) {
+ int hex_len = (code == 'U') ? 6 : 4;
+ for (int j = 0; j < hex_len; j++) {
if (_is_at_end()) {
return make_error("Unterminated string.");
}
char32_t digit = _peek();
char32_t value = 0;
- if (digit >= '0' && digit <= '9') {
+ if (is_digit(digit)) {
value = digit - '0';
} else if (digit >= 'a' && digit <= 'f') {
value = digit - 'a';
@@ -886,7 +874,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
_advance();
}
- break;
+ } break;
case '\r':
if (_peek() != '\n') {
// Carriage return without newline in string. (???)
@@ -909,11 +897,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
valid_escape = false;
break;
}
+ // Parse UTF-16 pair.
+ if (valid_escape) {
+ if ((escaped & 0xfffffc00) == 0xd800) {
+ if (prev == 0) {
+ prev = escaped;
+ prev_pos = column - 2;
+ continue;
+ } else {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ prev = 0;
+ }
+ } else if ((escaped & 0xfffffc00) == 0xdc00) {
+ if (prev == 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
+ error.start_column = column - 2;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ valid_escape = false;
+ } else {
+ escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
+ prev = 0;
+ }
+ }
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
+ }
if (valid_escape) {
result += escaped;
}
} else if (ch == quote_char) {
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
_advance();
if (is_multiline) {
if (_peek() == quote_char && _peek(1) == quote_char) {
@@ -930,6 +960,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
break;
}
} else {
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
result += ch;
_advance();
if (ch == '\n') {
@@ -937,6 +974,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
}
}
}
+ if (prev != 0) {
+ Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
+ error.start_column = prev_pos;
+ error.leftmost_column = error.start_column;
+ push_error(error);
+ prev = 0;
+ }
// Make the literal.
Variant string;
@@ -1262,9 +1306,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
line_continuation = false;
- if (_is_digit(c)) {
+ if (is_digit(c)) {
return number();
- } else if (_is_alphanumeric(c)) {
+ } else if (is_ascii_identifier_char(c)) {
return potential_identifier();
}
@@ -1332,7 +1376,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (_peek() == '.') {
_advance();
return make_token(Token::PERIOD_PERIOD);
- } else if (_is_digit(_peek())) {
+ } else if (is_digit(_peek())) {
// Number starting with '.'.
return number();
} else {