diff options
author | George Marques <george@gmarqu.es> | 2023-01-18 22:56:00 -0300 |
---|---|---|
committer | George Marques <george@gmarqu.es> | 2023-01-21 13:39:40 -0300 |
commit | 7548e043fce1211e21030bb41c6fe6d6900a7a5a (patch) | |
tree | 4c9284d244a5586dd490dc9bb40b8017b289b584 | |
parent | 2ec0da1a75a066fe88986fc6ceda22fbabf7eedd (diff) |
Add support for Unicode identifiers in GDScript
This is using an adapted version of UAX#31 to not rely on the ICU
database (which isn't available in builds without TextServerAdvanced).
It allows most characters used in diverse scripts but not everything.
14 files changed, 145 insertions, 33 deletions
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index de41edc305..b18f7f4314 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -384,6 +384,9 @@ <member name="debug/gdscript/warnings/assert_always_true" type="int" setter="" getter="" default="1"> When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an [code]assert[/code] call always evaluates to true. </member> + <member name="debug/gdscript/warnings/confusable_identifier" type="int" setter="" getter="" default="1"> + When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an indentifier contains characters that can be confused with something else, like when mixing different alphabets. + </member> <member name="debug/gdscript/warnings/constant_used_as_function" type="int" setter="" getter="" default="1"> When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when a constant is used as a function. </member> diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp index f5d3306376..4228046ba2 100644 --- a/modules/gdscript/gdscript_parser.cpp +++ b/modules/gdscript/gdscript_parser.cpp @@ -41,6 +41,7 @@ #include "core/os/os.h" #include "core/string/string_builder.h" #include "gdscript_warning.h" +#include "servers/text_server.h" #endif // DEBUG_ENABLED #ifdef TOOLS_ENABLED @@ -186,24 +187,6 @@ void GDScriptParser::push_error(const String &p_message, const Node *p_origin) { } #ifdef DEBUG_ENABLED -void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1, const String &p_symbol2, const String &p_symbol3, const String &p_symbol4) { - ERR_FAIL_COND(p_source == nullptr); - Vector<String> symbols; - if (!p_symbol1.is_empty()) { - symbols.push_back(p_symbol1); - } - if (!p_symbol2.is_empty()) { - symbols.push_back(p_symbol2); - } - if (!p_symbol3.is_empty()) { - symbols.push_back(p_symbol3); - } - if (!p_symbol4.is_empty()) { - symbols.push_back(p_symbol4); - } - push_warning(p_source, p_code, symbols); -} - void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector<String> &p_symbols) { ERR_FAIL_COND(p_source == nullptr); if (is_ignoring_warnings) { @@ -2251,7 +2234,14 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_expression(bool p_can_assi } GDScriptParser::IdentifierNode *GDScriptParser::parse_identifier() { - return static_cast<IdentifierNode *>(parse_identifier(nullptr, false)); + IdentifierNode *identifier = static_cast<IdentifierNode *>(parse_identifier(nullptr, false)); +#ifdef DEBUG_ENABLED + // Check for spoofing here (if available in TextServer) since this isn't called inside expressions. This is only relevant for declarations. + if (identifier && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY) && TS->spoof_check(identifier->name.operator String())) { + push_warning(identifier, GDScriptWarning::CONFUSABLE_IDENTIFIER, identifier->name.operator String()); + } +#endif + return identifier; } GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode *p_previous_operand, bool p_can_assign) { diff --git a/modules/gdscript/gdscript_parser.h b/modules/gdscript/gdscript_parser.h index 0903f62061..f6d2a8feee 100644 --- a/modules/gdscript/gdscript_parser.h +++ b/modules/gdscript/gdscript_parser.h @@ -1361,8 +1361,11 @@ private: void clear(); void push_error(const String &p_message, const Node *p_origin = nullptr); #ifdef DEBUG_ENABLED - void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1 = String(), const String &p_symbol2 = String(), const String &p_symbol3 = String(), const String &p_symbol4 = String()); void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector<String> &p_symbols); + template <typename... Symbols> + void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Symbols &...p_symbols) { + push_warning(p_source, p_code, Vector<String>{ p_symbols... }); + } #endif void make_completion_context(CompletionType p_type, Node *p_node, int p_argument = -1, bool p_force = false); diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp index e17a804003..d7f1114fd3 100644 --- a/modules/gdscript/gdscript_tokenizer.cpp +++ b/modules/gdscript/gdscript_tokenizer.cpp @@ -31,10 +31,14 @@ #include "gdscript_tokenizer.h" #include "core/error/error_macros.h" +#include "core/string/char_utils.h" #ifdef TOOLS_ENABLED #include "editor/editor_settings.h" #endif +#ifdef DEBUG_ENABLED +#include "servers/text_server.h" +#endif static const char *token_names[] = { "Empty", // EMPTY, @@ -435,10 +439,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To } GDScriptTokenizer::Token GDScriptTokenizer::annotation() { - if (!is_ascii_identifier_char(_peek())) { + if (is_unicode_identifier_start(_peek())) { + _advance(); // Consume start character. + } else { push_error("Expected annotation identifier after \"@\"."); } - while (is_ascii_identifier_char(_peek())) { + while (is_unicode_identifier_continue(_peek())) { // Consume all identifier characters. _advance(); } @@ -447,7 +453,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::annotation() { return annotation; } -GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ KEYWORD_GROUP('a') \ KEYWORD("as", Token::AS) \ @@ -512,8 +517,21 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { #define MIN_KEYWORD_LENGTH 2 #define MAX_KEYWORD_LENGTH 10 - // Consume all alphanumeric characters. - while (is_ascii_identifier_char(_peek())) { +#ifdef DEBUG_ENABLED +void GDScriptTokenizer::make_keyword_list() { +#define KEYWORD_LINE(keyword, token_type) keyword, +#define KEYWORD_GROUP_IGNORE(group) + keyword_list = { + KEYWORDS(KEYWORD_GROUP_IGNORE, KEYWORD_LINE) + }; +#undef KEYWORD_LINE +#undef KEYWORD_GROUP_IGNORE +} +#endif // DEBUG_ENABLED + +GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { + // Consume all identifier characters. + while (is_unicode_identifier_continue(_peek())) { _advance(); } @@ -565,15 +583,28 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { } // Not a keyword, so must be an identifier. - return make_identifier(name); + Token id = make_identifier(name); + +#ifdef DEBUG_ENABLED + // Additional checks for identifiers but only in debug and if it's available in TextServer. + if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) { + int64_t confusable = TS->is_confusable(name, keyword_list); + if (confusable >= 0) { + push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable])); + } + } +#endif // DEBUG_ENABLED + + return id; -#undef KEYWORDS -#undef MIN_KEYWORD_LENGTH -#undef MAX_KEYWORD_LENGTH #undef KEYWORD_GROUP_CASE #undef KEYWORD } +#undef MAX_KEYWORD_LENGTH +#undef MIN_KEYWORD_LENGTH +#undef KEYWORDS + void GDScriptTokenizer::newline(bool p_make_token) { // Don't overwrite previous newline, nor create if we want a line continuation. if (p_make_token && !pending_newline && !line_continuation) { @@ -720,7 +751,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { error.rightmost_column = column + 1; push_error(error); has_error = true; - } else if (is_ascii_identifier_char(_peek())) { + } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) { // Letter at the end of the number. push_error("Invalid numeric notation."); } @@ -1311,7 +1342,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { if (is_digit(c)) { return number(); - } else if (is_ascii_identifier_char(c)) { + } else if (is_unicode_identifier_start(c)) { return potential_identifier(); } @@ -1504,7 +1535,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { } default: - return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1))); + if (is_whitespace(c)) { + return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c))); + } else { + return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1))); + } } } @@ -1514,4 +1549,7 @@ GDScriptTokenizer::GDScriptTokenizer() { tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size"); } #endif // TOOLS_ENABLED +#ifdef DEBUG_ENABLED + make_keyword_list(); +#endif // DEBUG_ENABLED } diff --git a/modules/gdscript/gdscript_tokenizer.h b/modules/gdscript/gdscript_tokenizer.h index 9588922122..608840d3f1 100644 --- a/modules/gdscript/gdscript_tokenizer.h +++ b/modules/gdscript/gdscript_tokenizer.h @@ -224,6 +224,9 @@ private: char32_t indent_char = '\0'; int position = 0; int length = 0; +#ifdef DEBUG_ENABLED + Vector<String> keyword_list; +#endif // DEBUG_ENABLED #ifdef TOOLS_ENABLED HashMap<int, CommentData> comments; @@ -239,6 +242,10 @@ private: void _skip_whitespace(); void check_indent(); +#ifdef DEBUG_ENABLED + void make_keyword_list(); +#endif // DEBUG_ENABLED + Token make_error(const String &p_message); void push_error(const String &p_message); void push_error(const Token &p_error); diff --git a/modules/gdscript/gdscript_warning.cpp b/modules/gdscript/gdscript_warning.cpp index 184cecb316..a6cbb7f6ae 100644 --- a/modules/gdscript/gdscript_warning.cpp +++ b/modules/gdscript/gdscript_warning.cpp @@ -155,6 +155,10 @@ String GDScriptWarning::get_message() const { CHECK_SYMBOLS(2); return vformat(R"(The function '%s()' is a static function but was called from an instance. Instead, it should be directly called from the type: '%s.%s()'.)", symbols[0], symbols[1], symbols[0]); } + case CONFUSABLE_IDENTIFIER: { + CHECK_SYMBOLS(1); + return vformat(R"(The identifier "%s" has misleading characters and might be confused with something else.)", symbols[0]); + } case WARNING_MAX: break; // Can't happen, but silences warning } @@ -219,6 +223,7 @@ String GDScriptWarning::get_name_from_code(Code p_code) { "SHADOWED_GLOBAL_IDENTIFIER", "INT_ASSIGNED_TO_ENUM", "STATIC_CALLED_ON_INSTANCE", + "CONFUSABLE_IDENTIFIER", }; static_assert((sizeof(names) / sizeof(*names)) == WARNING_MAX, "Amount of warning types don't match the amount of warning names."); diff --git a/modules/gdscript/gdscript_warning.h b/modules/gdscript/gdscript_warning.h index e3aee45f33..b485f02b9c 100644 --- a/modules/gdscript/gdscript_warning.h +++ b/modules/gdscript/gdscript_warning.h @@ -78,6 +78,7 @@ public: SHADOWED_GLOBAL_IDENTIFIER, // A global class or function has the same name as variable. INT_ASSIGNED_TO_ENUM, // An integer value was assigned to an enum-typed variable without casting. STATIC_CALLED_ON_INSTANCE, // A static method was called on an instance of a class instead of on the class itself. + CONFUSABLE_IDENTIFIER, // The identifier contains misleading characters that can be confused. E.g. "usеr" (has Cyrillic "е" instead of Latin "e"). WARNING_MAX, }; diff --git a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out index b018091c18..32e230fc80 100644 --- a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out +++ b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out @@ -2,4 +2,4 @@ GDTEST_OK >> WARNING >> Line: 2 >> UNUSED_PARAMETER ->> +>> The parameter 'unused' is never used in the function ''. If this is intended, prefix it with an underscore: '_unused' diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd new file mode 100644 index 0000000000..4b1f284070 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd @@ -0,0 +1,3 @@ +func test(): + var аs # Using Cyrillic "а". + print(аs) diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out new file mode 100644 index 0000000000..337dec2f4d --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out @@ -0,0 +1,2 @@ +GDTEST_PARSER_ERROR +Identifier "аs" is visually similar to the GDScript keyword "as" and thus not allowed. diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd new file mode 100644 index 0000000000..523959a016 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd @@ -0,0 +1,35 @@ +const π = PI +var ㄥ = π + +func test(): + var փորձարկում = "test" + prints("փորձարկում", փորձարկում) + var امتحان = "test" + prints("امتحان", امتحان) + var পরীক্ষা = "test" + prints("পরীক্ষা", পরীক্ষা) + var тест = "test" + prints("тест", тест) + var जाँच = "test" + prints("जाँच", जाँच) + var 기준 = "test" + prints("기준", 기준) + var 测试 = "test" + prints("测试", 测试) + var テスト = "test" + prints("テスト", テスト) + var 試験 = "test" + prints("試験", 試験) + var പരീക്ഷ = "test" + prints("പരീക്ഷ", പരീക്ഷ) + var ทดสอบ = "test" + prints("ทดสอบ", ทดสอบ) + var δοκιμή = "test" + prints("δοκιμή", δοκιμή) + + const d = 1.1 + _process(d) + print(is_equal_approx(ㄥ, PI + (d * PI))) + +func _process(Δ: float) -> void: + ㄥ += Δ * π diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out new file mode 100644 index 0000000000..c071380a8f --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out @@ -0,0 +1,14 @@ +GDTEST_OK +փորձարկում test +امتحان test +পরীক্ষা test +тест test +जाँच test +기준 test +测试 test +テスト test +試験 test +പരീക്ഷ test +ทดสอบ test +δοκιμή test +true diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd new file mode 100644 index 0000000000..e2caac8ffd --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd @@ -0,0 +1,5 @@ +func test(): + var port = 0 # Only latin characters. + var pοrt = 1 # The "ο" is Greek omicron. + + prints(port, pοrt) diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out new file mode 100644 index 0000000000..c483396443 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out @@ -0,0 +1,6 @@ +GDTEST_OK +>> WARNING +>> Line: 3 +>> CONFUSABLE_IDENTIFIER +>> The identifier "pοrt" has misleading characters and might be confused with something else. +0 1 |