From 5aa48b6ae5f44f16b3bf62c7c1c3501295132142 Mon Sep 17 00:00:00 2001 From: bruvzg <7645683+bruvzg@users.noreply.github.com> Date: Mon, 18 Oct 2021 15:07:11 +0300 Subject: [TextServer] Implement ICU/UAX 31 based `is_valid_identifier` function. --- modules/text_server_adv/text_server_adv.cpp | 186 ++++++++++++++++++++++++++++ modules/text_server_adv/text_server_adv.h | 1 + 2 files changed, 187 insertions(+) (limited to 'modules/text_server_adv') diff --git a/modules/text_server_adv/text_server_adv.cpp b/modules/text_server_adv/text_server_adv.cpp index 64788bfeff..f8dbbc2e61 100644 --- a/modules/text_server_adv/text_server_adv.cpp +++ b/modules/text_server_adv/text_server_adv.cpp @@ -346,6 +346,7 @@ bool TextServerAdvanced::has_feature(Feature p_feature) const { case FEATURE_FONT_VARIABLE: case FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION: case FEATURE_USE_SUPPORT_DATA: + case FEATURE_UNICODE_IDENTIFIERS: return true; default: { } @@ -5757,6 +5758,191 @@ PackedInt32Array TextServerAdvanced::string_get_word_breaks(const String &p_stri return ret; } +bool TextServerAdvanced::is_valid_identifier(const String &p_string) const { + enum UAX31SequenceStatus { + SEQ_NOT_STARTED, + SEQ_STARTED, + SEQ_STARTED_VIR, + SEQ_NEAR_END, + }; + + const char32_t *str = p_string.ptr(); + int len = p_string.length(); + + if (len == 0) { + return false; // Empty string. + } + + UErrorCode err = U_ZERO_ERROR; + Char16String utf16 = p_string.utf16(); + const UNormalizer2 *norm_c = unorm2_getNFCInstance(&err); + if (U_FAILURE(err)) { + return false; // Failed to load normalizer. + } + bool isnurom = unorm2_isNormalized(norm_c, utf16.ptr(), utf16.length(), &err); + if (U_FAILURE(err) || !isnurom) { + return false; // Do not conform to Normalization Form C. + } + + UAX31SequenceStatus A1_sequence_status = SEQ_NOT_STARTED; + UScriptCode A1_scr = USCRIPT_INHERITED; + UAX31SequenceStatus A2_sequence_status = SEQ_NOT_STARTED; + UScriptCode A2_scr = USCRIPT_INHERITED; + UAX31SequenceStatus B_sequence_status = SEQ_NOT_STARTED; + UScriptCode B_scr = USCRIPT_INHERITED; + + for (int i = 0; i < len; i++) { + err = U_ZERO_ERROR; + UScriptCode scr = uscript_getScript(str[i], &err); + if (U_FAILURE(err)) { + return false; // Invalid script. + } + if (uscript_getUsage(scr) != USCRIPT_USAGE_RECOMMENDED) { + return false; // Not a recommended script. + } + uint8_t cat = u_charType(str[i]); + int32_t jt = u_getIntPropertyValue(str[i], UCHAR_JOINING_TYPE); + + // UAX #31 section 2.3 subsections A1, A2 and B, check ZWNJ and ZWJ usage. + switch (A1_sequence_status) { + case SEQ_NEAR_END: { + if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) { + return false; // Mixed script. + } + if (jt == U_JT_RIGHT_JOINING || jt == U_JT_DUAL_JOINING) { + A1_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset. + } else if (jt != U_JT_TRANSPARENT) { + return false; // Invalid end of sequence. + } + } break; + case SEQ_STARTED: { + if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) { + A1_sequence_status = SEQ_NOT_STARTED; // Reset. + } else { + if (jt != U_JT_TRANSPARENT) { + if (str[i] == 0x200C /*ZWNJ*/) { + A1_sequence_status = SEQ_NEAR_END; + continue; + } else { + A1_sequence_status = SEQ_NOT_STARTED; // Reset. + } + } + } + } break; + default: + break; + } + if (A1_sequence_status == SEQ_NOT_STARTED) { + if (jt == U_JT_LEFT_JOINING || jt == U_JT_DUAL_JOINING) { + A1_sequence_status = SEQ_STARTED; + A1_scr = scr; + } + }; + + switch (A2_sequence_status) { + case SEQ_NEAR_END: { + if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) { + return false; // Mixed script. + } + if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) { + A2_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset. + } else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) { + return false; // Invalid end of sequence. + } + } break; + case SEQ_STARTED_VIR: { + if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) { + A2_sequence_status = SEQ_NOT_STARTED; // Reset. + } else { + if (str[i] == 0x200C /*ZWNJ*/) { + A2_sequence_status = SEQ_NEAR_END; + continue; + } else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) { + A2_sequence_status = SEQ_NOT_STARTED; // Reset. + } + } + } break; + case SEQ_STARTED: { + if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) { + A2_sequence_status = SEQ_NOT_STARTED; // Reset. + } else { + if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) { + A2_sequence_status = SEQ_STARTED_VIR; + } else if (cat != U_MODIFIER_LETTER) { + A2_sequence_status = SEQ_NOT_STARTED; // Reset. + } + } + } break; + default: + break; + } + if (A2_sequence_status == SEQ_NOT_STARTED) { + if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) { + A2_sequence_status = SEQ_STARTED; + A2_scr = scr; + } + } + + switch (B_sequence_status) { + case SEQ_NEAR_END: { + if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) { + return false; // Mixed script. + } + if (u_getIntPropertyValue(str[i], UCHAR_INDIC_SYLLABIC_CATEGORY) != U_INSC_VOWEL_DEPENDENT) { + B_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset. + } else { + return false; // Invalid end of sequence. + } + } break; + case SEQ_STARTED_VIR: { + if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) { + B_sequence_status = SEQ_NOT_STARTED; // Reset. + } else { + if (str[i] == 0x200D /*ZWJ*/) { + B_sequence_status = SEQ_NEAR_END; + continue; + } else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) { + B_sequence_status = SEQ_NOT_STARTED; // Reset. + } + } + } break; + case SEQ_STARTED: { + if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) { + B_sequence_status = SEQ_NOT_STARTED; // Reset. + } else { + if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) { + B_sequence_status = SEQ_STARTED_VIR; + } else if (cat != U_MODIFIER_LETTER) { + B_sequence_status = SEQ_NOT_STARTED; // Reset. + } + } + } break; + default: + break; + } + if (B_sequence_status == SEQ_NOT_STARTED) { + if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) { + B_sequence_status = SEQ_STARTED; + B_scr = scr; + } + } + + if (u_hasBinaryProperty(str[i], UCHAR_PATTERN_SYNTAX) || u_hasBinaryProperty(str[i], UCHAR_PATTERN_WHITE_SPACE) || u_hasBinaryProperty(str[i], UCHAR_NONCHARACTER_CODE_POINT)) { + return false; // Not a XID_Start or XID_Continue character. + } + if (i == 0) { + if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || str[0] == 0x2118 || str[0] == 0x212E || str[0] == 0x309B || str[0] == 0x309C || str[0] == 0x005F)) { + return false; // Not a XID_Start character. + } + } else { + if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || cat == U_NON_SPACING_MARK || cat == U_COMBINING_SPACING_MARK || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || str[i] == 0x2118 || str[i] == 0x212E || str[i] == 0x309B || str[i] == 0x309C || str[i] == 0x1369 || str[i] == 0x1371 || str[i] == 0x00B7 || str[i] == 0x0387 || str[i] == 0x19DA || str[i] == 0x0E33 || str[i] == 0x0EB3 || str[i] == 0xFF9E || str[i] == 0xFF9F)) { + return false; // Not a XID_Continue character. + } + } + } + return true; +} + TextServerAdvanced::TextServerAdvanced() { _insert_num_systems_lang(); _insert_feature_sets(); diff --git a/modules/text_server_adv/text_server_adv.h b/modules/text_server_adv/text_server_adv.h index 8cd0e753ba..170d92f8fc 100644 --- a/modules/text_server_adv/text_server_adv.h +++ b/modules/text_server_adv/text_server_adv.h @@ -702,6 +702,7 @@ public: virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const override; virtual String strip_diacritics(const String &p_string) const override; + virtual bool is_valid_identifier(const String &p_string) const override; virtual String string_to_upper(const String &p_string, const String &p_language = "") const override; virtual String string_to_lower(const String &p_string, const String &p_language = "") const override; -- cgit v1.2.3