summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorge Marques <george@gmarqu.es>2023-01-18 22:56:00 -0300
committerGeorge Marques <george@gmarqu.es>2023-01-21 13:39:40 -0300
commit7548e043fce1211e21030bb41c6fe6d6900a7a5a (patch)
tree4c9284d244a5586dd490dc9bb40b8017b289b584
parent2ec0da1a75a066fe88986fc6ceda22fbabf7eedd (diff)
Add support for Unicode identifiers in GDScript
This is using an adapted version of UAX#31 to not rely on the ICU database (which isn't available in builds without TextServerAdvanced). It allows most characters used in diverse scripts but not everything.
-rw-r--r--doc/classes/ProjectSettings.xml3
-rw-r--r--modules/gdscript/gdscript_parser.cpp28
-rw-r--r--modules/gdscript/gdscript_parser.h5
-rw-r--r--modules/gdscript/gdscript_tokenizer.cpp62
-rw-r--r--modules/gdscript/gdscript_tokenizer.h7
-rw-r--r--modules/gdscript/gdscript_warning.cpp5
-rw-r--r--modules/gdscript/gdscript_warning.h1
-rw-r--r--modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out2
-rw-r--r--modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd3
-rw-r--r--modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out2
-rw-r--r--modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd35
-rw-r--r--modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out14
-rw-r--r--modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd5
-rw-r--r--modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out6
14 files changed, 145 insertions, 33 deletions
diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml
index de41edc305..b18f7f4314 100644
--- a/doc/classes/ProjectSettings.xml
+++ b/doc/classes/ProjectSettings.xml
@@ -384,6 +384,9 @@
<member name="debug/gdscript/warnings/assert_always_true" type="int" setter="" getter="" default="1">
When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an [code]assert[/code] call always evaluates to true.
</member>
+ <member name="debug/gdscript/warnings/confusable_identifier" type="int" setter="" getter="" default="1">
+ When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an identifier contains characters that can be confused with something else, like when mixing different alphabets.
+ </member>
<member name="debug/gdscript/warnings/constant_used_as_function" type="int" setter="" getter="" default="1">
When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when a constant is used as a function.
</member>
diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp
index f5d3306376..4228046ba2 100644
--- a/modules/gdscript/gdscript_parser.cpp
+++ b/modules/gdscript/gdscript_parser.cpp
@@ -41,6 +41,7 @@
#include "core/os/os.h"
#include "core/string/string_builder.h"
#include "gdscript_warning.h"
+#include "servers/text_server.h"
#endif // DEBUG_ENABLED
#ifdef TOOLS_ENABLED
@@ -186,24 +187,6 @@ void GDScriptParser::push_error(const String &p_message, const Node *p_origin) {
}
#ifdef DEBUG_ENABLED
-void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1, const String &p_symbol2, const String &p_symbol3, const String &p_symbol4) {
- ERR_FAIL_COND(p_source == nullptr);
- Vector<String> symbols;
- if (!p_symbol1.is_empty()) {
- symbols.push_back(p_symbol1);
- }
- if (!p_symbol2.is_empty()) {
- symbols.push_back(p_symbol2);
- }
- if (!p_symbol3.is_empty()) {
- symbols.push_back(p_symbol3);
- }
- if (!p_symbol4.is_empty()) {
- symbols.push_back(p_symbol4);
- }
- push_warning(p_source, p_code, symbols);
-}
-
void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector<String> &p_symbols) {
ERR_FAIL_COND(p_source == nullptr);
if (is_ignoring_warnings) {
@@ -2251,7 +2234,14 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_expression(bool p_can_assi
}
GDScriptParser::IdentifierNode *GDScriptParser::parse_identifier() {
- return static_cast<IdentifierNode *>(parse_identifier(nullptr, false));
+ IdentifierNode *identifier = static_cast<IdentifierNode *>(parse_identifier(nullptr, false));
+#ifdef DEBUG_ENABLED
+ // Check for spoofing here (if available in TextServer) since this isn't called inside expressions. This is only relevant for declarations.
+ if (identifier && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY) && TS->spoof_check(identifier->name.operator String())) {
+ push_warning(identifier, GDScriptWarning::CONFUSABLE_IDENTIFIER, identifier->name.operator String());
+ }
+#endif
+ return identifier;
}
GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode *p_previous_operand, bool p_can_assign) {
diff --git a/modules/gdscript/gdscript_parser.h b/modules/gdscript/gdscript_parser.h
index 0903f62061..f6d2a8feee 100644
--- a/modules/gdscript/gdscript_parser.h
+++ b/modules/gdscript/gdscript_parser.h
@@ -1361,8 +1361,11 @@ private:
void clear();
void push_error(const String &p_message, const Node *p_origin = nullptr);
#ifdef DEBUG_ENABLED
- void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1 = String(), const String &p_symbol2 = String(), const String &p_symbol3 = String(), const String &p_symbol4 = String());
void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector<String> &p_symbols);
+ template <typename... Symbols>
+ void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Symbols &...p_symbols) {
+ push_warning(p_source, p_code, Vector<String>{ p_symbols... });
+ }
#endif
void make_completion_context(CompletionType p_type, Node *p_node, int p_argument = -1, bool p_force = false);
diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp
index e17a804003..d7f1114fd3 100644
--- a/modules/gdscript/gdscript_tokenizer.cpp
+++ b/modules/gdscript/gdscript_tokenizer.cpp
@@ -31,10 +31,14 @@
#include "gdscript_tokenizer.h"
#include "core/error/error_macros.h"
+#include "core/string/char_utils.h"
#ifdef TOOLS_ENABLED
#include "editor/editor_settings.h"
#endif
+#ifdef DEBUG_ENABLED
+#include "servers/text_server.h"
+#endif
static const char *token_names[] = {
"Empty", // EMPTY,
@@ -435,10 +439,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To
}
GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
- if (!is_ascii_identifier_char(_peek())) {
+ if (is_unicode_identifier_start(_peek())) {
+ _advance(); // Consume start character.
+ } else {
push_error("Expected annotation identifier after \"@\".");
}
- while (is_ascii_identifier_char(_peek())) {
+ while (is_unicode_identifier_continue(_peek())) {
// Consume all identifier characters.
_advance();
}
@@ -447,7 +453,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
return annotation;
}
-GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
KEYWORD_GROUP('a') \
KEYWORD("as", Token::AS) \
@@ -512,8 +517,21 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
#define MIN_KEYWORD_LENGTH 2
#define MAX_KEYWORD_LENGTH 10
- // Consume all alphanumeric characters.
- while (is_ascii_identifier_char(_peek())) {
+#ifdef DEBUG_ENABLED
+void GDScriptTokenizer::make_keyword_list() {
+#define KEYWORD_LINE(keyword, token_type) keyword,
+#define KEYWORD_GROUP_IGNORE(group)
+ keyword_list = {
+ KEYWORDS(KEYWORD_GROUP_IGNORE, KEYWORD_LINE)
+ };
+#undef KEYWORD_LINE
+#undef KEYWORD_GROUP_IGNORE
+}
+#endif // DEBUG_ENABLED
+
+GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
+ // Consume all identifier characters.
+ while (is_unicode_identifier_continue(_peek())) {
_advance();
}
@@ -565,15 +583,28 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
}
// Not a keyword, so must be an identifier.
- return make_identifier(name);
+ Token id = make_identifier(name);
+
+#ifdef DEBUG_ENABLED
+ // Additional checks for identifiers but only in debug and if it's available in TextServer.
+ if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
+ int64_t confusable = TS->is_confusable(name, keyword_list);
+ if (confusable >= 0) {
+ push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
+ }
+ }
+#endif // DEBUG_ENABLED
+
+ return id;
-#undef KEYWORDS
-#undef MIN_KEYWORD_LENGTH
-#undef MAX_KEYWORD_LENGTH
#undef KEYWORD_GROUP_CASE
#undef KEYWORD
}
+#undef MAX_KEYWORD_LENGTH
+#undef MIN_KEYWORD_LENGTH
+#undef KEYWORDS
+
void GDScriptTokenizer::newline(bool p_make_token) {
// Don't overwrite previous newline, nor create if we want a line continuation.
if (p_make_token && !pending_newline && !line_continuation) {
@@ -720,7 +751,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() {
error.rightmost_column = column + 1;
push_error(error);
has_error = true;
- } else if (is_ascii_identifier_char(_peek())) {
+ } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
// Letter at the end of the number.
push_error("Invalid numeric notation.");
}
@@ -1311,7 +1342,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (is_digit(c)) {
return number();
- } else if (is_ascii_identifier_char(c)) {
+ } else if (is_unicode_identifier_start(c)) {
return potential_identifier();
}
@@ -1504,7 +1535,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
}
default:
- return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
+ if (is_whitespace(c)) {
+ return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c)));
+ } else {
+ return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
+ }
}
}
@@ -1514,4 +1549,7 @@ GDScriptTokenizer::GDScriptTokenizer() {
tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size");
}
#endif // TOOLS_ENABLED
+#ifdef DEBUG_ENABLED
+ make_keyword_list();
+#endif // DEBUG_ENABLED
}
diff --git a/modules/gdscript/gdscript_tokenizer.h b/modules/gdscript/gdscript_tokenizer.h
index 9588922122..608840d3f1 100644
--- a/modules/gdscript/gdscript_tokenizer.h
+++ b/modules/gdscript/gdscript_tokenizer.h
@@ -224,6 +224,9 @@ private:
char32_t indent_char = '\0';
int position = 0;
int length = 0;
+#ifdef DEBUG_ENABLED
+ Vector<String> keyword_list;
+#endif // DEBUG_ENABLED
#ifdef TOOLS_ENABLED
HashMap<int, CommentData> comments;
@@ -239,6 +242,10 @@ private:
void _skip_whitespace();
void check_indent();
+#ifdef DEBUG_ENABLED
+ void make_keyword_list();
+#endif // DEBUG_ENABLED
+
Token make_error(const String &p_message);
void push_error(const String &p_message);
void push_error(const Token &p_error);
diff --git a/modules/gdscript/gdscript_warning.cpp b/modules/gdscript/gdscript_warning.cpp
index 184cecb316..a6cbb7f6ae 100644
--- a/modules/gdscript/gdscript_warning.cpp
+++ b/modules/gdscript/gdscript_warning.cpp
@@ -155,6 +155,10 @@ String GDScriptWarning::get_message() const {
CHECK_SYMBOLS(2);
return vformat(R"(The function '%s()' is a static function but was called from an instance. Instead, it should be directly called from the type: '%s.%s()'.)", symbols[0], symbols[1], symbols[0]);
}
+ case CONFUSABLE_IDENTIFIER: {
+ CHECK_SYMBOLS(1);
+ return vformat(R"(The identifier "%s" has misleading characters and might be confused with something else.)", symbols[0]);
+ }
case WARNING_MAX:
break; // Can't happen, but silences warning
}
@@ -219,6 +223,7 @@ String GDScriptWarning::get_name_from_code(Code p_code) {
"SHADOWED_GLOBAL_IDENTIFIER",
"INT_ASSIGNED_TO_ENUM",
"STATIC_CALLED_ON_INSTANCE",
+ "CONFUSABLE_IDENTIFIER",
};
static_assert((sizeof(names) / sizeof(*names)) == WARNING_MAX, "Amount of warning types don't match the amount of warning names.");
diff --git a/modules/gdscript/gdscript_warning.h b/modules/gdscript/gdscript_warning.h
index e3aee45f33..b485f02b9c 100644
--- a/modules/gdscript/gdscript_warning.h
+++ b/modules/gdscript/gdscript_warning.h
@@ -78,6 +78,7 @@ public:
SHADOWED_GLOBAL_IDENTIFIER, // A global class or function has the same name as variable.
INT_ASSIGNED_TO_ENUM, // An integer value was assigned to an enum-typed variable without casting.
STATIC_CALLED_ON_INSTANCE, // A static method was called on an instance of a class instead of on the class itself.
+ CONFUSABLE_IDENTIFIER, // The identifier contains misleading characters that can be confused. E.g. "usеr" (has Cyrillic "е" instead of Latin "e").
WARNING_MAX,
};
diff --git a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out
index b018091c18..32e230fc80 100644
--- a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out
+++ b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out
@@ -2,4 +2,4 @@ GDTEST_OK
>> WARNING
>> Line: 2
>> UNUSED_PARAMETER
->>
+>> The parameter 'unused' is never used in the function ''. If this is intended, prefix it with an underscore: '_unused'
diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd
new file mode 100644
index 0000000000..4b1f284070
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd
@@ -0,0 +1,3 @@
+func test():
+ var аs # Using Cyrillic "а".
+ print(аs)
diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out
new file mode 100644
index 0000000000..337dec2f4d
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out
@@ -0,0 +1,2 @@
+GDTEST_PARSER_ERROR
+Identifier "аs" is visually similar to the GDScript keyword "as" and thus not allowed.
diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd
new file mode 100644
index 0000000000..523959a016
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd
@@ -0,0 +1,35 @@
+const π = PI
+var ㄥ = π
+
+func test():
+ var փորձարկում = "test"
+ prints("փորձարկում", փորձարկում)
+ var امتحان = "test"
+ prints("امتحان", امتحان)
+ var পরীক্ষা = "test"
+ prints("পরীক্ষা", পরীক্ষা)
+ var тест = "test"
+ prints("тест", тест)
+ var जाँच = "test"
+ prints("जाँच", जाँच)
+ var 기준 = "test"
+ prints("기준", 기준)
+ var 测试 = "test"
+ prints("测试", 测试)
+ var テスト = "test"
+ prints("テスト", テスト)
+ var 試験 = "test"
+ prints("試験", 試験)
+ var പരീക്ഷ = "test"
+ prints("പരീക്ഷ", പരീക്ഷ)
+ var ทดสอบ = "test"
+ prints("ทดสอบ", ทดสอบ)
+ var δοκιμή = "test"
+ prints("δοκιμή", δοκιμή)
+
+ const d = 1.1
+ _process(d)
+ print(is_equal_approx(ㄥ, PI + (d * PI)))
+
+func _process(Δ: float) -> void:
+ ㄥ += Δ * π
diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out
new file mode 100644
index 0000000000..c071380a8f
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out
@@ -0,0 +1,14 @@
+GDTEST_OK
+փորձարկում test
+امتحان test
+পরীক্ষা test
+тест test
+जाँच test
+기준 test
+测试 test
+テスト test
+試験 test
+പരീക്ഷ test
+ทดสอบ test
+δοκιμή test
+true
diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd
new file mode 100644
index 0000000000..e2caac8ffd
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd
@@ -0,0 +1,5 @@
+func test():
+ var port = 0 # Only latin characters.
+ var pοrt = 1 # The "ο" is Greek omicron.
+
+ prints(port, pοrt)
diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out
new file mode 100644
index 0000000000..c483396443
--- /dev/null
+++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out
@@ -0,0 +1,6 @@
+GDTEST_OK
+>> WARNING
+>> Line: 3
+>> CONFUSABLE_IDENTIFIER
+>> The identifier "pοrt" has misleading characters and might be confused with something else.
+0 1