diff options
Diffstat (limited to 'modules/gdscript/gdscript_tokenizer.cpp')
-rw-r--r-- | modules/gdscript/gdscript_tokenizer.cpp | 140 |
1 files changed, 100 insertions, 40 deletions
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp index d4a098811a..63fad0d9bb 100644 --- a/modules/gdscript/gdscript_tokenizer.cpp +++ b/modules/gdscript/gdscript_tokenizer.cpp @@ -5,8 +5,8 @@ /* GODOT ENGINE */ /* https://godotengine.org */ /*************************************************************************/ -/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ -/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* Copyright (c) 2007-2022 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2022 Godot Engine contributors (cf. AUTHORS.md). */ /* */ /* Permission is hereby granted, free of charge, to any person obtaining */ /* a copy of this software and associated documentation files (the */ @@ -312,22 +312,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::pop_error() { return error; } -static bool _is_alphanumeric(char32_t c) { - return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'; -} - -static bool _is_digit(char32_t c) { - return (c >= '0' && c <= '9'); -} - -static bool _is_hex_digit(char32_t c) { - return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); -} - -static bool _is_binary_digit(char32_t c) { - return (c == '0' || c == '1'); -} - GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) { Token token(p_type); token.start_line = start_line; @@ -448,10 +432,10 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To } GDScriptTokenizer::Token GDScriptTokenizer::annotation() { - if (!_is_alphanumeric(_peek())) { + if (!is_ascii_identifier_char(_peek())) { push_error("Expected annotation identifier after \"@\"."); } - while (_is_alphanumeric(_peek())) { + while (is_ascii_identifier_char(_peek())) { // Consume all identifier characters. _advance(); } @@ -526,7 +510,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { #define MAX_KEYWORD_LENGTH 10 // Consume all alphanumeric characters. - while (_is_alphanumeric(_peek())) { + while (is_ascii_identifier_char(_peek())) { _advance(); } @@ -612,7 +596,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { bool has_decimal = false; bool has_exponent = false; bool has_error = false; - bool (*digit_check_func)(char32_t) = _is_digit; + bool (*digit_check_func)(char32_t) = is_digit; if (_peek(-1) == '.') { has_decimal = true; @@ -620,20 +604,20 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { if (_peek() == 'x') { // Hexadecimal. base = 16; - digit_check_func = _is_hex_digit; + digit_check_func = is_hex_digit; _advance(); } else if (_peek() == 'b') { // Binary. base = 2; - digit_check_func = _is_binary_digit; + digit_check_func = is_binary_digit; _advance(); } } // Allow '_' to be used in a number, for readability. bool previous_was_underscore = false; - while (digit_check_func(_peek()) || _peek() == '_') { - if (_peek() == '_') { + while (digit_check_func(_peek()) || is_underscore(_peek())) { + if (is_underscore(_peek())) { if (previous_was_underscore) { Token error = make_error(R"(Only one underscore can be used as a numeric separator.)"); error.start_column = column; @@ -682,7 +666,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { _advance(); // Consume decimal digits. - while (_is_digit(_peek()) || _peek() == '_') { + while (is_digit(_peek()) || is_underscore(_peek())) { _advance(); } } @@ -696,7 +680,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { _advance(); } // Consume exponent digits. - if (!_is_digit(_peek())) { + if (!is_digit(_peek())) { Token error = make_error(R"(Expected exponent value after "e".)"); error.start_column = column; error.leftmost_column = column; @@ -705,8 +689,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { push_error(error); } previous_was_underscore = false; - while (_is_digit(_peek()) || _peek() == '_') { - if (_peek() == '_') { + while (is_digit(_peek()) || is_underscore(_peek())) { + if (is_underscore(_peek())) { if (previous_was_underscore) { Token error = make_error(R"(Only one underscore can be used as a numeric separator.)"); error.start_column = column; @@ -733,7 +717,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { error.rightmost_column = column + 1; push_error(error); has_error = true; - } else if (_is_alphanumeric(_peek())) { + } else if (is_ascii_identifier_char(_peek())) { // Letter at the end of the number. push_error("Invalid numeric notation."); } @@ -786,6 +770,8 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { } String result; + char32_t prev = 0; + int prev_pos = 0; for (;;) { // Consume actual string. @@ -795,6 +781,15 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { char32_t ch = _peek(); + if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) { + Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion."); + error.start_column = column; + error.leftmost_column = error.start_column; + error.end_column = column + 1; + error.rightmost_column = error.end_column; + push_error(error); + } + if (ch == '\\') { // Escape pattern. _advance(); @@ -843,16 +838,18 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { case '\\': escaped = '\\'; break; - case 'u': + case 'U': + case 'u': { // Hexadecimal sequence. - for (int i = 0; i < 4; i++) { + int hex_len = (code == 'U') ? 6 : 4; + for (int j = 0; j < hex_len; j++) { if (_is_at_end()) { return make_error("Unterminated string."); } char32_t digit = _peek(); char32_t value = 0; - if (digit >= '0' && digit <= '9') { + if (is_digit(digit)) { value = digit - '0'; } else if (digit >= 'a' && digit <= 'f') { value = digit - 'a'; @@ -877,7 +874,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { _advance(); } - break; + } break; case '\r': if (_peek() != '\n') { // Carriage return without newline in string. (???) @@ -900,11 +897,53 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { valid_escape = false; break; } + // Parse UTF-16 pair. + if (valid_escape) { + if ((escaped & 0xfffffc00) == 0xd800) { + if (prev == 0) { + prev = escaped; + prev_pos = column - 2; + continue; + } else { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + error.start_column = column - 2; + error.leftmost_column = error.start_column; + push_error(error); + valid_escape = false; + prev = 0; + } + } else if ((escaped & 0xfffffc00) == 0xdc00) { + if (prev == 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate"); + error.start_column = column - 2; + error.leftmost_column = error.start_column; + push_error(error); + valid_escape = false; + } else { + escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000); + prev = 0; + } + } + if (prev != 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + error.start_column = prev_pos; + error.leftmost_column = error.start_column; + push_error(error); + prev = 0; + } + } if (valid_escape) { result += escaped; } } else if (ch == quote_char) { + if (prev != 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + error.start_column = prev_pos; + error.leftmost_column = error.start_column; + push_error(error); + prev = 0; + } _advance(); if (is_multiline) { if (_peek() == quote_char && _peek(1) == quote_char) { @@ -921,6 +960,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { break; } } else { + if (prev != 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + error.start_column = prev_pos; + error.leftmost_column = error.start_column; + push_error(error); + prev = 0; + } result += ch; _advance(); if (ch == '\n') { @@ -928,6 +974,13 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() { } } } + if (prev != 0) { + Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate"); + error.start_column = prev_pos; + error.leftmost_column = error.start_column; + push_error(error); + prev = 0; + } // Make the literal. Variant string; @@ -1064,7 +1117,8 @@ void GDScriptTokenizer::check_indent() { // First time indenting, choose character now. indent_char = current_indent_char; } else if (current_indent_char != indent_char) { - Token error = make_error(vformat("Used \"%s\" for indentation instead \"%s\" as used before in the file.", String(¤t_indent_char, 1).c_escape(), String(&indent_char, 1).c_escape())); + Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.", + _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char))); error.start_line = line; error.start_column = 1; error.leftmost_column = 1; @@ -1114,6 +1168,12 @@ void GDScriptTokenizer::check_indent() { } } +String GDScriptTokenizer::_get_indent_char_name(char32_t ch) { + ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape()); + + return ch == ' ' ? "space" : "tab"; +} + void GDScriptTokenizer::_skip_whitespace() { if (pending_indents != 0) { // Still have some indent/dedent tokens to give. @@ -1246,9 +1306,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { line_continuation = false; - if (_is_digit(c)) { + if (is_digit(c)) { return number(); - } else if (_is_alphanumeric(c)) { + } else if (is_ascii_identifier_char(c)) { return potential_identifier(); } @@ -1316,7 +1376,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { if (_peek() == '.') { _advance(); return make_token(Token::PERIOD_PERIOD); - } else if (_is_digit(_peek())) { + } else if (is_digit(_peek())) { // Number starting with '.'. return number(); } else { @@ -1433,7 +1493,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { } default: - return make_error(vformat(R"(Unknown character "%s".")", String(&c, 1))); + return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1))); } } |