diff options
author | Rémi Verschelde <remi@verschelde.fr> | 2021-10-28 10:15:32 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-10-28 10:15:32 +0200 |
commit | c24bdfb327fd913d9706e020de1be2a6b79e4271 (patch) | |
tree | 482166808a72700a51e9e29b73e35abc7f279da0 | |
parent | 0ec77631979997b3e6bcd9146ea8f1c3e4166b81 (diff) | |
parent | 63f3051154a7e672956cffe41f90ed8d56a9ec23 (diff) |
Merge pull request #53737 from bruvzg/icu_strip_diacritics
-rw-r--r-- | doc/classes/TextServer.xml | 7 | ||||
-rw-r--r-- | modules/text_server_adv/text_server_adv.cpp | 33 | ||||
-rw-r--r-- | modules/text_server_adv/text_server_adv.h | 3 | ||||
-rw-r--r-- | servers/text_server.cpp | 137 | ||||
-rw-r--r-- | servers/text_server.h | 7 | ||||
-rw-r--r-- | tests/test_text_server.h | 23 |
6 files changed, 204 insertions, 6 deletions
diff --git a/doc/classes/TextServer.xml b/doc/classes/TextServer.xml
index b10c7bcc96..3e32afe370 100644
--- a/doc/classes/TextServer.xml
+++ b/doc/classes/TextServer.xml
@@ -1233,6 +1233,13 @@
 				Aligns shaped text to the given tab-stops.
 			</description>
 		</method>
+		<method name="strip_diacritics" qualifiers="const">
+			<return type="String" />
+			<argument index="0" name="string" type="String" />
+			<description>
+				Strips diacritics from the string.
+			</description>
+		</method>
 		<method name="tag_to_name" qualifiers="const">
 			<return type="String" />
 			<argument index="0" name="tag" type="int" />
diff --git a/modules/text_server_adv/text_server_adv.cpp b/modules/text_server_adv/text_server_adv.cpp
index c459141265..776134b598 100644
--- a/modules/text_server_adv/text_server_adv.cpp
+++ b/modules/text_server_adv/text_server_adv.cpp
@@ -4924,6 +4924,39 @@ String TextServerAdvanced::percent_sign(const String &p_language) const {
 	return "%";
 }
 
+String TextServerAdvanced::strip_diacritics(const String &p_string) const {
+	UErrorCode err = U_ZERO_ERROR;
+
+	// Get NFKD normalizer singleton.
+	const UNormalizer2 *unorm = unorm2_getNFKDInstance(&err);
+	ERR_FAIL_COND_V_MSG(U_FAILURE(err), TextServer::strip_diacritics(p_string), u_errorName(err));
+
+	// Convert to UTF-16.
+	Char16String utf16 = p_string.utf16();
+
+	// Normalize.
+	Char16String normalized;
+	err = U_ZERO_ERROR;
+	int32_t len = unorm2_normalize(unorm, utf16.ptr(), -1, nullptr, 0, &err); // Preflight call: no output buffer, only queries the required length.
+	ERR_FAIL_COND_V_MSG(err != U_BUFFER_OVERFLOW_ERROR, TextServer::strip_diacritics(p_string), u_errorName(err));
+	normalized.resize(len);
+	err = U_ZERO_ERROR;
+	unorm2_normalize(unorm, utf16.ptr(), -1, normalized.ptrw(), len, &err);
+	ERR_FAIL_COND_V_MSG(U_FAILURE(err), TextServer::strip_diacritics(p_string), u_errorName(err));
+
+	// Convert back to UTF-32.
+	String normalized_string = String::utf16(normalized.ptr(), len);
+
+	// Strip combining characters.
+	String result;
+	for (int i = 0; i < normalized_string.length(); i++) {
+		if (u_getCombiningClass(normalized_string[i]) == 0) {
+			result += normalized_string[i];
+		}
+	}
+	return result;
+}
+
 TextServerAdvanced::TextServerAdvanced() {
 	_insert_num_systems_lang();
 	_insert_feature_sets();
diff --git a/modules/text_server_adv/text_server_adv.h b/modules/text_server_adv/text_server_adv.h
index 333b68e074..15f3a7f1a9 100644
--- a/modules/text_server_adv/text_server_adv.h
+++ b/modules/text_server_adv/text_server_adv.h
@@ -50,6 +50,7 @@
 #include <unicode/udata.h>
 #include <unicode/uiter.h>
 #include <unicode/uloc.h>
+#include <unicode/unorm2.h>
 #include <unicode/uscript.h>
 #include <unicode/ustring.h>
 #include <unicode/utypes.h>
@@ -501,6 +502,8 @@ public:
 	virtual String parse_number(const String &p_string, const String &p_language = "") const override;
 	virtual String percent_sign(const String &p_language = "") const override;
 
+	virtual String strip_diacritics(const String &p_string) const override;
+
 	TextServerAdvanced();
 	~TextServerAdvanced();
 };
diff --git a/servers/text_server.cpp b/servers/text_server.cpp
index 9b64661b0c..af4718678e 100644
--- a/servers/text_server.cpp
+++ b/servers/text_server.cpp
@@ -42,7 +42,7 @@ void TextServerManager::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("find_interface", "name"), &TextServerManager::find_interface);
 
 	ClassDB::bind_method(D_METHOD("set_primary_interface", "index"), &TextServerManager::set_primary_interface);
-	ClassDB::bind_method(D_METHOD("get_primary_interface"), &TextServerManager::_get_primary_interface);
+	ClassDB::bind_method(D_METHOD("get_primary_interface"), &TextServerManager::get_primary_interface);
 
 	ADD_SIGNAL(MethodInfo("interface_added", PropertyInfo(Variant::STRING_NAME, "interface_name")));
 	ADD_SIGNAL(MethodInfo("interface_removed", PropertyInfo(Variant::STRING_NAME, "interface_name")));
@@ -118,10 +118,6 @@ Array TextServerManager::get_interfaces() const {
 	return ret;
 }
 
-Ref<TextServer> TextServerManager::_get_primary_interface() const {
-	return primary_interface;
-}
-
 void TextServerManager::set_primary_interface(const Ref<TextServer> &p_primary_interface) {
 	if (p_primary_interface.is_null()) {
 		print_verbose("TextServer: Clearing primary interface");
@@ -407,6 +403,8 @@ void TextServer::_bind_methods() {
 	ClassDB::bind_method(D_METHOD("parse_number", "number", "language"), &TextServer::parse_number, DEFVAL(""));
 	ClassDB::bind_method(D_METHOD("percent_sign", "language"), &TextServer::percent_sign, DEFVAL(""));
 
+	ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
+
 	/* Direction */
 	BIND_ENUM_CONSTANT(DIRECTION_AUTO);
 	BIND_ENUM_CONSTANT(DIRECTION_LTR);
@@ -1317,6 +1315,134 @@ void TextServer::shaped_text_draw_outline(RID p_shaped, RID p_canvas, const Vect
 	}
 }
 
+void TextServer::_diacritics_map_add(const String &p_from, char32_t p_to) {
+	for (int i = 0; i < p_from.length(); i++) { // length(), not size(): size() also counts the terminating NUL.
+		diacritics_map[p_from[i]] = p_to;
+	}
+}
+
+void TextServer::_init_diacritics_map() {
+	diacritics_map.clear();
+
+	// Latin.
+	_diacritics_map_add(U"ÀÁÂÃÄÅĀĂĄǍǞǠǺȀȂȦḀẠẢẤẦẨẪẬẮẰẲẴẶ", U'A');
+	_diacritics_map_add(U"àáâãäåāăąǎǟǡǻȁȃȧḁẚạảấầẩẫậắằẳẵặ", U'a');
+	_diacritics_map_add(U"ǢǼ", U'Æ');
+	_diacritics_map_add(U"ǣǽ", U'æ');
+	_diacritics_map_add(U"ḂḄḆ", U'B');
+	_diacritics_map_add(U"ḃḅḇ", U'b');
+	_diacritics_map_add(U"ÇĆĈĊČḈ", U'C');
+	_diacritics_map_add(U"çćĉċčḉ", U'c');
+	_diacritics_map_add(U"ĎḊḌḎḐḒ", U'D');
+	_diacritics_map_add(U"ďḋḍḏḑḓ", U'd');
+	_diacritics_map_add(U"ÈÉÊËĒĔĖĘĚȆȨḔḖḘḚḜẸẺẼẾỀỂỄỆ", U'E');
+	_diacritics_map_add(U"èéêëēĕėęěȇȩḕḗḙḛḝẹẻẽếềểễệ", U'e');
+	_diacritics_map_add(U"Ḟ", U'F');
+	_diacritics_map_add(U"ḟ", U'f');
+	_diacritics_map_add(U"ĜĞĠĢǦǴḠ", U'G');
+	_diacritics_map_add(U"ĝğġģǧǵḡ", U'g');
+	_diacritics_map_add(U"ĤȞḢḤḦḨḪ", U'H');
+	_diacritics_map_add(U"ĥȟḣḥḧḩḫẖ", U'h');
+	_diacritics_map_add(U"ÌÍÎÏĨĪĬĮİǏȈȊḬḮỈỊ", U'I');
+	_diacritics_map_add(U"ìíîïĩīĭįıǐȉȋḭḯỉị", U'i');
+	_diacritics_map_add(U"Ĵ", U'J');
+	_diacritics_map_add(U"ĵ", U'j');
+	_diacritics_map_add(U"ĶǨḰḲḴ", U'K');
+	_diacritics_map_add(U"ķĸǩḱḳḵ", U'k');
+	_diacritics_map_add(U"ĹĻĽĿḶḸḺḼ", U'L');
+	_diacritics_map_add(U"ĺļľŀḷḹḻḽ", U'l');
+	_diacritics_map_add(U"ḾṀṂ", U'M');
+	_diacritics_map_add(U"ḿṁṃ", U'm');
+	_diacritics_map_add(U"ÑŃŅŇǸṄṆṈṊ", U'N');
+	_diacritics_map_add(U"ñńņňʼnǹṅṇṉṋ", U'n');
+	_diacritics_map_add(U"ÒÓÔÕÖŌŎŐƠǑǪǬȌȎȪȬȮȰṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢ", U'O');
+	_diacritics_map_add(U"òóôõöōŏőơǒǫǭȍȏȫȭȯȱṍṏṑṓọỏốồổỗộớờởỡợ", U'o');
+	_diacritics_map_add(U"ṔṖ", U'P');
+	_diacritics_map_add(U"ṗṕ", U'p');
+	_diacritics_map_add(U"ŔŖŘȐȒṘṚṜṞ", U'R');
+	_diacritics_map_add(U"ŕŗřȑȓṙṛṝṟ", U'r');
+	_diacritics_map_add(U"ŚŜŞŠȘṠṢṤṦṨ", U'S');
+	_diacritics_map_add(U"śŝşšſșṡṣṥṧṩẛẜẝ", U's');
+	_diacritics_map_add(U"ŢŤȚṪṬṮṰ", U'T');
+	_diacritics_map_add(U"ţťțṫṭṯṱẗ", U't');
+	_diacritics_map_add(U"ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖṲṴṶṸṺỤỦỨỪỬỮỰ", U'U');
+	_diacritics_map_add(U"ùúûüũūŭůűųưǔǖǘǚǜȕȗṳṵṷṹṻụủứừửữự", U'u');
+	_diacritics_map_add(U"ṼṾ", U'V');
+	_diacritics_map_add(U"ṽṿ", U'v');
+	_diacritics_map_add(U"ŴẀẂẄẆẈ", U'W');
+	_diacritics_map_add(U"ŵẁẃẅẇẉẘ", U'w');
+	_diacritics_map_add(U"ẊẌ", U'X');
+	_diacritics_map_add(U"ẋẍ", U'x');
+	_diacritics_map_add(U"ÝŶẎỲỴỶỸỾ", U'Y');
+	_diacritics_map_add(U"ýÿŷẏẙỳỵỷỹỿ", U'y');
+	_diacritics_map_add(U"ŹŻŽẐẒẔ", U'Z');
+	_diacritics_map_add(U"źżžẑẓẕ", U'z');
+
+	// Greek.
+	_diacritics_map_add(U"ΆἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼ", U'Α');
+	_diacritics_map_add(U"άἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ", U'α');
+	_diacritics_map_add(U"ΈἘἙἚἛἜἝῈΈ", U'Ε');
+	_diacritics_map_add(U"έἐἑἒἓἔἕὲέ", U'ε');
+	_diacritics_map_add(U"ΉἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌ", U'Η');
+	_diacritics_map_add(U"ήἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ", U'η');
+	_diacritics_map_add(U"ΊΪἸἹἺἻἼἽἾἿῘῙῚΊ", U'Ι');
+	_diacritics_map_add(U"ίΐϊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ", U'ι');
+	_diacritics_map_add(U"ΌὈὉὊὋὌὍῸΌ", U'Ο');
+	_diacritics_map_add(U"όὀὁὂὃὄὅὸό", U'ο');
+	_diacritics_map_add(U"Ῥ", U'Ρ');
+	_diacritics_map_add(U"ῤῥ", U'ρ');
+	_diacritics_map_add(U"ΎΫϓϔὙὛὝὟῨῩῪΎ", U'Υ');
+	_diacritics_map_add(U"ΰϋύὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ", U'υ');
+	_diacritics_map_add(U"ΏὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼ", U'Ω');
+	_diacritics_map_add(U"ώὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ", U'ω');
+
+	// Cyrillic.
+	_diacritics_map_add(U"ӐӒ", U'А');
+	_diacritics_map_add(U"ӑӓ", U'а');
+	_diacritics_map_add(U"ЀЁӖ", U'Е');
+	_diacritics_map_add(U"ѐёӗ", U'е');
+	_diacritics_map_add(U"Ӛ", U'Ә');
+	_diacritics_map_add(U"ӛ", U'ә');
+	_diacritics_map_add(U"Ӝ", U'Ж');
+	_diacritics_map_add(U"ӝ", U'ж');
+	_diacritics_map_add(U"Ӟ", U'З');
+	_diacritics_map_add(U"ӟ", U'з');
+	_diacritics_map_add(U"Ѓ", U'Г');
+	_diacritics_map_add(U"ѓ", U'г');
+	_diacritics_map_add(U"Ї", U'І');
+	_diacritics_map_add(U"ї", U'і');
+	_diacritics_map_add(U"ЍӢӤЙ", U'И');
+	_diacritics_map_add(U"ѝӣӥй", U'и');
+	_diacritics_map_add(U"Ќ", U'К');
+	_diacritics_map_add(U"ќ", U'к');
+	_diacritics_map_add(U"Ӧ", U'О');
+	_diacritics_map_add(U"ӧ", U'о');
+	_diacritics_map_add(U"Ӫ", U'Ө');
+	_diacritics_map_add(U"ӫ", U'ө');
+	_diacritics_map_add(U"Ӭ", U'Э');
+	_diacritics_map_add(U"ӭ", U'э');
+	_diacritics_map_add(U"ЎӮӰӲ", U'У');
+	_diacritics_map_add(U"ўӯӱӳ", U'у');
+	_diacritics_map_add(U"Ӵ", U'Ч');
+	_diacritics_map_add(U"ӵ", U'ч');
+	_diacritics_map_add(U"Ӹ", U'Ы');
+	_diacritics_map_add(U"ӹ", U'ы');
+}
+
+String TextServer::strip_diacritics(const String &p_string) const {
+	String result;
+	for (int i = 0; i < p_string.length(); i++) {
+		if (p_string[i] < 0x02B0 || p_string[i] > 0x036F) { // Skip U+02B0..U+036F (spacing modifier letters and combining diacritical marks).
+			if (diacritics_map.has(p_string[i])) {
+				result += diacritics_map[p_string[i]];
+			} else {
+				result += p_string[i];
+			}
+		}
+	}
+	return result;
+}
+
 Array TextServer::_shaped_text_get_glyphs_wrapper(RID p_shaped) const {
 	Array ret;
 
@@ -1393,6 +1519,7 @@ Array TextServer::_shaped_text_get_ellipsis_glyphs_wrapper(RID p_shaped) const {
 }
 
 TextServer::TextServer() {
+	_init_diacritics_map();
 }
 
 TextServer::~TextServer() {
diff --git a/servers/text_server.h b/servers/text_server.h
index 3a5f946fbf..a5484d8fbd 100644
--- a/servers/text_server.h
+++ b/servers/text_server.h
@@ -194,6 +194,10 @@ protected:
 		Vector<Glyph> glyphs_logical;
 	};
 
+	Map<char32_t, char32_t> diacritics_map;
+	void _diacritics_map_add(const String &p_from, char32_t p_to);
+	void _init_diacritics_map();
+
 	static void _bind_methods();
 
 public:
@@ -427,6 +431,8 @@ public:
 	virtual String parse_number(const String &p_string, const String &p_language = "") const { return p_string; };
 	virtual String percent_sign(const String &p_language = "") const { return "%"; };
 
+	virtual String strip_diacritics(const String &p_string) const;
+
 	TextServer();
 	~TextServer();
 };
@@ -509,7 +515,6 @@ public:
 	_FORCE_INLINE_ Ref<TextServer> get_primary_interface() const {
 		return primary_interface;
 	}
-	Ref<TextServer> _get_primary_interface() const;
 	void set_primary_interface(const Ref<TextServer> &p_primary_interface);
 
 	TextServerManager();
diff --git a/tests/test_text_server.h b/tests/test_text_server.h
index af3df7bc79..4edffe3711 100644
--- a/tests/test_text_server.h
+++ b/tests/test_text_server.h
@@ -265,6 +265,29 @@ TEST_SUITE("[[TextServer]") {
 				font.clear();
 			}
 		}
+
+		SUBCASE("[TextServer] Strip Diacritics") {
+			for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
+				Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
+				TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");
+
+				if (ts->has_feature(TextServer::FEATURE_SHAPING)) {
+					CHECK(ts->strip_diacritics(U"ٱلسَّلَامُ عَلَيْكُمْ") == U"ٱلسلام عليكم");
+				}
+
+				CHECK(ts->strip_diacritics(U"pêches épinards tomates fraises") == U"peches epinards tomates fraises");
+				CHECK(ts->strip_diacritics(U"ΆΈΉΊΌΎΏΪΫϓϔ") == U"ΑΕΗΙΟΥΩΙΥΥΥ");
+				CHECK(ts->strip_diacritics(U"άέήίΐϊΰϋόύώ") == U"αεηιιιυυουω");
+				CHECK(ts->strip_diacritics(U"ЀЁЃ ЇЌЍӢӤЙ ЎӮӰӲ ӐӒӖӚӜӞ ӦӪ Ӭ Ӵ Ӹ") == U"ЕЕГ ІКИИИИ УУУУ ААЕӘЖЗ ОӨ Э Ч Ы");
+				CHECK(ts->strip_diacritics(U"ѐёѓ їќѝӣӥй ўӯӱӳ ӑӓӗӛӝӟ ӧӫ ӭ ӵ ӹ") == U"еег ікииии уууу ааеәжз оө э ч ы");
+				CHECK(ts->strip_diacritics(U"ÀÁÂÃÄÅĀĂĄÇĆĈĊČĎÈÉÊËĒĔĖĘĚĜĞĠĢĤÌÍÎÏĨĪĬĮİĴĶĹĻĽÑŃŅŇŊÒÓÔÕÖØŌŎŐƠŔŖŘŚŜŞŠŢŤÙÚÛÜŨŪŬŮŰŲƯŴÝŶŹŻŽ") == U"AAAAAAAAACCCCCDEEEEEEEEEGGGGHIIIIIIIIIJKLLLNNNNŊOOOOOØOOOORRRSSSSTTUUUUUUUUUUUWYYZZZ");
+				CHECK(ts->strip_diacritics(U"àáâãäåāăąçćĉċčďèéêëēĕėęěĝğġģĥìíîïĩīĭįĵķĺļľñńņňŋòóôõöøōŏőơŕŗřśŝşšţťùúûüũūŭůűųưŵýÿŷźżž") == U"aaaaaaaaacccccdeeeeeeeeegggghiiiiiiiijklllnnnnŋoooooøoooorrrssssttuuuuuuuuuuuwyyyzzz");
+				CHECK(ts->strip_diacritics(U"ǍǏȈǑǪǬȌȎȪȬȮȰǓǕǗǙǛȔȖǞǠǺȀȂȦǢǼǦǴǨǸȆȐȒȘȚȞȨ Ḁ ḂḄḆ Ḉ ḊḌḎḐḒ ḔḖḘḚḜ Ḟ Ḡ ḢḤḦḨḪ ḬḮ ḰḲḴ ḶḸḺḼ ḾṀṂ ṄṆṈṊ ṌṎṐṒ ṔṖ ṘṚṜṞ ṠṢṤṦṨ ṪṬṮṰ ṲṴṶṸṺ") == U"AIIOOOOOOOOOUUUUUUUAAAAAAÆÆGGKNERRSTHE A BBB C DDDDD EEEEE F G HHHHH II KKK LLLL MMM NNNN OOOO PP RRRR SSSSS TTTT UUUUU");
+				CHECK(ts->strip_diacritics(U"ǎǐȉȋǒǫǭȍȏȫȭȯȱǔǖǘǚǜȕȗǟǡǻȁȃȧǣǽǧǵǩǹȇȑȓșțȟȩ ḁ ḃḅḇ ḉ ḋḍḏḑḓ ḟ ḡ ḭḯ ḱḳḵ ḷḹḻḽ ḿṁṃ ṅṇṉṋ ṍṏṑṓ ṗṕ ṙṛṝṟ ṡṣṥṧṩ ṫṭṯṱ ṳṵṷṹṻ") == U"aiiiooooooooouuuuuuuaaaaaaææggknerrsthe a bbb c ddddd f g ii kkk llll mmm nnnn oooo pp rrrr sssss tttt uuuuu");
+				CHECK(ts->strip_diacritics(U"ṼṾ ẀẂẄẆẈ ẊẌ Ẏ ẐẒẔ") == U"VV WWWWW XX Y ZZZ");
+				CHECK(ts->strip_diacritics(U"ṽṿ ẁẃẅẇẉ ẋẍ ẏ ẑẓẕ ẖ ẗẘẙẛ") == U"vv wwwww xx y zzz h twys");
+			}
+		}
 	}
 }
 }; // namespace TestTextServer