diff options
Diffstat (limited to 'thirdparty/icu4c')
47 files changed, 4930 insertions, 299 deletions
diff --git a/thirdparty/icu4c/LICENSE b/thirdparty/icu4c/LICENSE index 970ae074cb..80b587723a 100644 --- a/thirdparty/icu4c/LICENSE +++ b/thirdparty/icu4c/LICENSE @@ -1,6 +1,19 @@ -COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE -Copyright © 1991-2020 Unicode, Inc. All rights reserved. +See Terms of Use <https://www.unicode.org/copyright.html> +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html. Permission is hereby granted, free of charge, to any person obtaining @@ -32,7 +45,7 @@ shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder. ---------------------- +---------------------------------------------------------------------- Third-Party Software Licenses @@ -40,7 +53,9 @@ This section contains third-party software notices and/or additional terms for licensed third-party software components included within ICU libraries. -1. ICU License - ICU 1.8.1 to ICU 57.1 +---------------------------------------------------------------------- + +ICU License - ICU 1.8.1 to ICU 57.1 COPYRIGHT AND PERMISSION NOTICE @@ -75,7 +90,9 @@ of the copyright holder. All trademarks and registered trademarks mentioned herein are the property of their respective owners. -2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt) +---------------------------------------------------------------------- + +Chinese/Japanese Word Break Dictionary Data (cjdict.txt) # The Google Chrome software developed by Google is licensed under # the BSD license. Other software included in this distribution is @@ -279,7 +296,9 @@ property of their respective owners. # # ---------------COPYING.ipadic-----END---------------------------------- -3. Lao Word Break Dictionary Data (laodict.txt) +---------------------------------------------------------------------- + +Lao Word Break Dictionary Data (laodict.txt) # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html @@ -319,7 +338,9 @@ property of their respective owners. # OF THE POSSIBILITY OF SUCH DAMAGE. # -------------------------------------------------------------------------- -4. Burmese Word Break Dictionary Data (burmesedict.txt) +---------------------------------------------------------------------- + +Burmese Word Break Dictionary Data (burmesedict.txt) # Copyright (c) 2014 International Business Machines Corporation # and others. All Rights Reserved. @@ -359,7 +380,9 @@ property of their respective owners. # SUCH DAMAGE. # -------------------------------------------------------------------------- -5. Time Zone Database +---------------------------------------------------------------------- + +Time Zone Database ICU uses the public domain data and code derived from Time Zone Database for its time zone support. The ownership of the TZ database @@ -382,7 +405,9 @@ Database section 7. # making a contribution to the database or code waives all rights to # future claims in that contribution or in the TZ Database. -6. Google double-conversion +---------------------------------------------------------------------- + +Google double-conversion Copyright 2006-2011, the V8 project authors. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -410,3 +435,85 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +File: aclocal.m4 (only for ICU4C) +Section: pkg.m4 - Macros to locate and utilise pkg-config. + + +Copyright © 2004 Scott James Remnant <scott@netsplit.com>. +Copyright © 2012-2015 Dan Nicholson <dbn.lists@gmail.com> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) + +---------------------------------------------------------------------- + +File: config.guess (only for ICU4C) + + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, see <https://www.gnu.org/licenses/>. + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. This Exception is an additional permission under section 7 +of the GNU General Public License, version 3 ("GPLv3"). + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) + +---------------------------------------------------------------------- + +File: install-sh (only for ICU4C) + + +Copyright 1991 by the Massachusetts Institute of Technology + +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that +copyright notice and this permission notice appear in supporting +documentation, and that the name of M.I.T. not be used in advertising or +publicity pertaining to distribution of the software without specific, +written prior permission. M.I.T. makes no representations about the +suitability of this software for any purpose. It is provided "as is" +without express or implied warranty. diff --git a/thirdparty/icu4c/common/brkeng.cpp b/thirdparty/icu4c/common/brkeng.cpp index 52e9c53621..dc9fb99bf1 100644 --- a/thirdparty/icu4c/common/brkeng.cpp +++ b/thirdparty/icu4c/common/brkeng.cpp @@ -79,6 +79,7 @@ UnhandledEngine::findBreaks( UText *text, int32_t /* startPos */, int32_t endPos, UVector32 &/*foundBreaks*/, + UBool /* isPhraseBreaking */, UErrorCode &status) const { if (U_FAILURE(status)) return 0; UChar32 c = utext_current32(text); diff --git a/thirdparty/icu4c/common/brkeng.h b/thirdparty/icu4c/common/brkeng.h index 6843f1cc95..127ba59e18 100644 --- a/thirdparty/icu4c/common/brkeng.h +++ b/thirdparty/icu4c/common/brkeng.h @@ -75,6 +75,7 @@ class LanguageBreakEngine : public UMemory { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const = 0; }; @@ -194,6 +195,7 @@ class UnhandledEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const override; /** diff --git a/thirdparty/icu4c/common/brkiter.cpp b/thirdparty/icu4c/common/brkiter.cpp index 8b228acf2c..8a1915880e 100644 --- a/thirdparty/icu4c/common/brkiter.cpp +++ b/thirdparty/icu4c/common/brkiter.cpp @@ -30,6 +30,7 @@ #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/filteredbrk.h" +#include "bytesinkutil.h" #include "ucln_cmn.h" #include "cstring.h" #include "umutex.h" @@ -115,7 +116,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st } // Create a RuleBasedBreakIterator - result = new RuleBasedBreakIterator(file, status); + result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status); // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != NULL) { @@ -408,7 +409,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) if (U_FAILURE(status)) { return NULL; } - char lbType[kKeyValueLenMax]; BreakIterator *result = NULL; switch (kind) { @@ -428,18 +428,29 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) break; case UBRK_LINE: { + char lb_lw[kKeyValueLenMax]; UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE); - uprv_strcpy(lbType, "line"); - char lbKeyValue[kKeyValueLenMax] = {0}; + uprv_strcpy(lb_lw, "line"); UErrorCode kvStatus = U_ZERO_ERROR; - int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); - if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { - uprv_strcat(lbType, "_"); - uprv_strcat(lbType, lbKeyValue); + CharString value; + CharStringByteSink valueSink(&value); + loc.getKeywordValue("lb", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); } - result = BreakIterator::buildInstance(loc, lbType, status); + // lw=phrase is only supported in Japanese. + if (uprv_strcmp(loc.getLanguage(), "ja") == 0) { + value.clear(); + loc.getKeywordValue("lw", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && value == "phrase") { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); + } + } + result = BreakIterator::buildInstance(loc, lb_lw, status); - UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue); + UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); UTRACE_EXIT_STATUS(status); } break; diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp index 4d158e3226..4fdbdf2760 100644 --- a/thirdparty/icu4c/common/dictbe.cpp +++ b/thirdparty/icu4c/common/dictbe.cpp @@ -17,7 +17,10 @@ #include "dictbe.h" #include "unicode/uniset.h" #include "unicode/chariter.h" +#include "unicode/resbund.h" #include "unicode/ubrk.h" +#include "unicode/usetiter.h" +#include "ubrkimpl.h" #include "utracimp.h" #include "uvectr32.h" #include "uvector.h" @@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; (void)startPos; // TODO: remove this param? @@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text, } rangeStart = start; rangeEnd = current; - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status); + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); utext_setNativeIndex(text, current); return result; @@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); - fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); + UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fThaiWordSet; + fEndWordSet = thaiWordSet; fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK @@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; utext_setNativeIndex(text, rangeStart); @@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); - fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); + UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fLaoWordSet; + fEndWordSet = laoWordSet; fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) @@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { @@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); - fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); + fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.add(0x0020); if (U_SUCCESS(status)) { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); - fEndWordSet = fBurmeseWordSet; - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching. fMarkSet.compact(); @@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { @@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); + UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fKhmerWordSet; + fEndWordSet = khmerWordSet; fBeginWordSet.add(0x1780, 0x17B3); //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { @@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType : DictionaryBreakEngine(), fDictionary(adoptDictionary) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); - // Korean dictionary only includes Hangul syllables - fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); - fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); - fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); - fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); nfkcNorm2 = Normalizer2::getNFKCInstance(status); - - if (U_SUCCESS(status)) { - // handle Korean and Japanese/Chinese using different dictionaries - if (type == kKorean) { + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); + fHangulWordSet.compact(); + // Digits, open puncutation and Alphabetic characters. + fDigitOrOpenPunctuationOrAlphabetSet.applyPattern( + UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status); + fDigitOrOpenPunctuationOrAlphabetSet.compact(); + fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status); + fClosePunctuationSet.compact(); + + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); - } else { //Chinese and Japanese - UnicodeSet cjSet; - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + } + } else { //Chinese and Japanese + UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + if (U_SUCCESS(status)) { setCharacters(cjSet); + initJapanesePhraseParameter(status); } } UTRACE_EXIT_STATUS(status); @@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) { (value >= 0xFF66 && value <= 0xFF9f); } - // Function for accessing internal utext flags. // Replicates an internal UText function. static inline int32_t utext_i32_flag(int32_t bitIndex) { return (int32_t)1 << bitIndex; } - /* * @param text A UText representing the text @@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if (rangeStart >= rangeEnd) { @@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { t_boundary.addElement(numCodePts, status); numBreaks++; + } else if (isPhraseBreaking) { + t_boundary.addElement(numCodePts, status); + if(U_SUCCESS(status)) { + numBreaks++; + int32_t prevIdx = numCodePts; + + int32_t codeUnitIdx = -1; + int32_t prevCodeUnitIdx = -1; + int32_t length = -1; + for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) { + codeUnitIdx = inString.moveIndex32(0, i); + prevCodeUnitIdx = inString.moveIndex32(0, prevIdx); + // Calculate the length by using the code unit. + length = prevCodeUnitIdx - codeUnitIdx; + prevIdx = i; + // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana + // characters don't occur. + if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length)) + && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1))) + || !isKatakana(inString.char32At(codeUnitIdx)))) { + t_boundary.addElement(i, status); + numBreaks++; + } + } + } } else { for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { t_boundary.addElement(i, status); @@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. int32_t prevCPPos = -1; int32_t prevUTextPos = -1; - for (int32_t i = numBreaks-1; i >= 0; i--) { + int32_t correctedNumBreaks = 0; + for (int32_t i = numBreaks - 1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart; @@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + if (utextPos != rangeStart + || (isPhraseBreaking && utextPos > 0 + && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + UChar32 nextChar = utext_char32At(inText, rangeEnd); + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 + // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U + if (isPhraseBreaking) { + if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) { + foundBreaks.popi(); + correctedNumBreaks--; + } + } else { + foundBreaks.popi(); + correctedNumBreaks--; + } + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; +} + +void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) { + loadJapaneseExtensions(error); + loadHiragana(error); +} + +void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) { + const char* tag = "extensions"; + ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error); + if (U_SUCCESS(error)) { + ResourceBundle bundle = ja.get(tag, error); + while (U_SUCCESS(error) && bundle.hasNext()) { + fSkipSet.puti(bundle.getNextString(error), 1, error); + } + } +} + +void CjkBreakEngine::loadHiragana(UErrorCode& error) { + UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error); + hiraganaWordSet.compact(); + UnicodeSetIterator iterator(hiraganaWordSet); + while (iterator.next()) { + fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error); + } } #endif diff --git a/thirdparty/icu4c/common/dictbe.h b/thirdparty/icu4c/common/dictbe.h index 4e70ed3817..ca1a3c28b7 100644 --- a/thirdparty/icu4c/common/dictbe.h +++ b/thirdparty/icu4c/common/dictbe.h @@ -15,6 +15,7 @@ #include "unicode/utext.h" #include "brkeng.h" +#include "hash.h" #include "uvectr32.h" U_NAMESPACE_BEGIN @@ -80,6 +81,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status ) const override; protected: @@ -105,6 +107,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const = 0; }; @@ -127,7 +130,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fThaiWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fSuffixSet; @@ -164,6 +166,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -186,7 +189,6 @@ class LaoBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fLaoWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -222,6 +224,7 @@ class LaoBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -244,7 +247,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fBurmeseWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -280,6 +282,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -302,7 +305,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fKhmerWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -338,6 +340,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -366,13 +369,22 @@ class CjkBreakEngine : public DictionaryBreakEngine { * @internal */ UnicodeSet fHangulWordSet; - UnicodeSet fHanWordSet; - UnicodeSet fKatakanaWordSet; - UnicodeSet fHiraganaWordSet; + UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; + UnicodeSet fClosePunctuationSet; DictionaryMatcher *fDictionary; const Normalizer2 *nfkcNorm2; + private: + // Load Japanese extensions. + void loadJapaneseExtensions(UErrorCode& error); + // Load Japanese Hiragana. + void loadHiragana(UErrorCode& error); + // Initialize fSkipSet by loading Japanese Hiragana and extensions. + void initJapanesePhraseParameter(UErrorCode& error); + + Hashtable fSkipSet; + public: /** @@ -404,6 +416,7 @@ class CjkBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp index 3d178dfbaf..2cad708d99 100644 --- a/thirdparty/icu4c/common/localematcher.cpp +++ b/thirdparty/icu4c/common/localematcher.cpp @@ -168,12 +168,9 @@ void LocaleMatcher::Builder::clearSupportedLocales() { bool LocaleMatcher::Builder::ensureSupportedLocaleVector() { if (U_FAILURE(errorCode_)) { return false; } if (supportedLocales_ != nullptr) { return true; } - supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); + LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_); if (U_FAILURE(errorCode_)) { return false; } - if (supportedLocales_ == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return false; - } + supportedLocales_ = lpSupportedLocales.orphan(); return true; } @@ -187,9 +184,8 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin for (int32_t i = 0; i < length; ++i) { Locale *locale = list.orphanLocaleAt(i); if (locale == nullptr) { continue; } - supportedLocales_->addElementX(locale, errorCode_); + supportedLocales_->adoptElement(locale, errorCode_); if (U_FAILURE(errorCode_)) { - delete locale; break; } } @@ -197,35 +193,21 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin } LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { - if (U_FAILURE(errorCode_)) { return *this; } - clearSupportedLocales(); - if (!ensureSupportedLocaleVector()) { return *this; } - while (locales.hasNext()) { - const Locale &locale = locales.next(); - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - break; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; - break; + if (ensureSupportedLocaleVector()) { + clearSupportedLocales(); + while (locales.hasNext() && U_SUCCESS(errorCode_)) { + const Locale &locale = locales.next(); + LocalPointer<Locale> clone (locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } } return *this; } LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { - if (!ensureSupportedLocaleVector()) { return *this; } - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return *this; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; + if (ensureSupportedLocaleVector()) { + LocalPointer<Locale> clone(locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } return *this; } diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp index e8859c7048..73bb8d8aec 100644 --- a/thirdparty/icu4c/common/locid.cpp +++ b/thirdparty/icu4c/common/locid.cpp @@ -1204,14 +1204,11 @@ AliasReplacer::parseLanguageReplacement( // We have multiple field so we have to allocate and parse CharString* str = new CharString( replacement, (int32_t)uprv_strlen(replacement), status); + LocalPointer<CharString> lpStr(str, status); + toBeFreed.adoptElement(lpStr.orphan(), status); if (U_FAILURE(status)) { return; } - if (str == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - toBeFreed.addElementX(str, status); char* data = str->data(); replacedLanguage = (const char*) data; char* endOfField = uprv_strchr(data, '_'); @@ -1420,12 +1417,9 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status) (int32_t)(firstSpace - replacement), status), status); } if (U_FAILURE(status)) { return false; } - if (item.isNull()) { - status = U_MEMORY_ALLOCATION_ERROR; - return false; - } replacedRegion = item->data(); - toBeFreed.addElementX(item.orphan(), status); + toBeFreed.adoptElement(item.orphan(), status); + if (U_FAILURE(status)) { return false; } } U_ASSERT(!same(region, replacedRegion)); region = replacedRegion; @@ -1659,10 +1653,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr && U_SUCCESS(status)) { *end = NULL_CHAR; // null terminate inside variantsBuff - variants.addElementX(start, status); + variants.addElement(start, status); start = end + 1; } - variants.addElementX(start, status); + variants.addElement(start, status); } if (U_FAILURE(status)) { return false; } diff --git a/thirdparty/icu4c/common/lstmbe.cpp b/thirdparty/icu4c/common/lstmbe.cpp index 3793abceb3..f6114cdfe2 100644 --- a/thirdparty/icu4c/common/lstmbe.cpp +++ b/thirdparty/icu4c/common/lstmbe.cpp @@ -1,8 +1,8 @@ // © 2021 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html +#include <complex> #include <utility> -#include <ctgmath> #include "unicode/utypes.h" @@ -639,6 +639,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; int32_t beginFoundBreakSize = foundBreaks.size(); diff --git a/thirdparty/icu4c/common/lstmbe.h b/thirdparty/icu4c/common/lstmbe.h index c3f7ecf815..ffdf805eca 100644 --- a/thirdparty/icu4c/common/lstmbe.h +++ b/thirdparty/icu4c/common/lstmbe.h @@ -62,6 +62,7 @@ protected: int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; private: const LSTMData* fData; diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp index 5bfd49e8cb..e6bd75e717 100644 --- a/thirdparty/icu4c/common/normalizer2impl.cpp +++ b/thirdparty/icu4c/common/normalizer2impl.cpp @@ -2496,15 +2496,18 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode // origin is not the first character, or it is U+0000. UnicodeSet *set; if((canonValue&CANON_HAS_SET)==0) { - set=new UnicodeSet; - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; + LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode); + set=lpSet.getAlias(); + if(U_FAILURE(errorCode)) { return; } UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); - canonStartSets.addElementX(set, errorCode); + canonStartSets.adoptElement(lpSet.orphan(), errorCode); + if (U_FAILURE(errorCode)) { + return; + } if(firstOrigin!=0) { set->add(firstOrigin); } diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp index f65177f232..cae8d154b3 100644 --- a/thirdparty/icu4c/common/rbbi.cpp +++ b/thirdparty/icu4c/common/rbbi.cpp @@ -82,6 +82,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +//------------------------------------------------------------------------------- +// +// Constructor from a UDataMemory handle to precompiled break rules +// stored in an ICU data file. This construcotr is private API, +// only for internal use. +// +//------------------------------------------------------------------------------- +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + UErrorCode &status) : RuleBasedBreakIterator(udm, status) +{ + fIsPhraseBreaking = isPhraseBreaking; +} + // // Construct from precompiled binary rules (tables). This constructor is public API, // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). @@ -322,6 +335,7 @@ void RuleBasedBreakIterator::init(UErrorCode &status) { fBreakCache = nullptr; fDictionaryCache = nullptr; fLookAheadMatches = nullptr; + fIsPhraseBreaking = false; // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. // fText = UTEXT_INITIALIZER; diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp index 6bfe3feca4..26d82df781 100644 --- a/thirdparty/icu4c/common/rbbi_cache.cpp +++ b/thirdparty/icu4c/common/rbbi_cache.cpp @@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status); + foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); } // Reload the loop variables for the next go-round diff --git a/thirdparty/icu4c/common/serv.cpp b/thirdparty/icu4c/common/serv.cpp index 0c54a4dce9..c26dbca1a9 100644 --- a/thirdparty/icu4c/common/serv.cpp +++ b/thirdparty/icu4c/common/serv.cpp @@ -625,10 +625,7 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC } } - LocalPointer<UnicodeString> idClone(new UnicodeString(*id), status); - if (U_SUCCESS(status) && idClone->isBogus()) { - status = U_MEMORY_ALLOCATION_ERROR; - } + LocalPointer<UnicodeString> idClone(id->clone(), status); result.adoptElement(idClone.orphan(), status); } delete fallbackKey; diff --git a/thirdparty/icu4c/common/servls.cpp b/thirdparty/icu4c/common/servls.cpp index 7108afd4a5..98f0a8a12b 100644 --- a/thirdparty/icu4c/common/servls.cpp +++ b/thirdparty/icu4c/common/servls.cpp @@ -179,7 +179,8 @@ private: length = other._ids.size(); for(i = 0; i < length; ++i) { - _ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + LocalPointer<UnicodeString> clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + _ids.adoptElement(clonedId.orphan(), status); } if(U_SUCCESS(status)) { diff --git a/thirdparty/icu4c/common/servnotf.cpp b/thirdparty/icu4c/common/servnotf.cpp index 342e0d9f24..d9fb388752 100644 --- a/thirdparty/icu4c/common/servnotf.cpp +++ b/thirdparty/icu4c/common/servnotf.cpp @@ -49,7 +49,11 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) if (acceptsListener(*l)) { Mutex lmx(¬ifyLock); if (listeners == NULL) { - listeners = new UVector(5, status); + LocalPointer<UVector> lpListeners(new UVector(5, status), status); + if (U_FAILURE(status)) { + return; + } + listeners = lpListeners.orphan(); } else { for (int i = 0, e = listeners->size(); i < e; ++i) { const EventListener* el = (const EventListener*)(listeners->elementAt(i)); @@ -59,7 +63,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) } } - listeners->addElementX((void*)l, status); // cast away const + listeners->addElement((void*)l, status); // cast away const } #ifdef NOTIFIER_DEBUG else { @@ -102,13 +106,11 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status) void ICUNotifier::notifyChanged(void) { + Mutex lmx(¬ifyLock); if (listeners != NULL) { - Mutex lmx(¬ifyLock); - if (listeners != NULL) { - for (int i = 0, e = listeners->size(); i < e; ++i) { - EventListener* el = (EventListener*)listeners->elementAt(i); - notifyListener(*el); - } + for (int i = 0, e = listeners->size(); i < e; ++i) { + EventListener* el = (EventListener*)listeners->elementAt(i); + notifyListener(*el); } } } diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp index bb5bdd1b50..f4e064961f 100644 --- a/thirdparty/icu4c/common/ubrk.cpp +++ b/thirdparty/icu4c/common/ubrk.cpp @@ -168,7 +168,7 @@ ubrk_safeClone( BreakIterator *newBI = ((BreakIterator *)bi)->clone(); if (newBI == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; - } else { + } else if (pBufferSize != NULL) { *status = U_SAFECLONE_ALLOCATED_WARNING; } return (UBreakIterator *)newBI; @@ -176,15 +176,7 @@ ubrk_safeClone( U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status) { - if (U_FAILURE(*status)) { - return nullptr; - } - BreakIterator *newBI = ((BreakIterator *)bi)->clone(); - if (newBI == nullptr) { - *status = U_MEMORY_ALLOCATION_ERROR; - return nullptr; - } - return (UBreakIterator *)newBI; + return ubrk_safeClone(bi, nullptr, nullptr, status); } diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp index 4aa856507a..388c86b1bb 100644 --- a/thirdparty/icu4c/common/ucase.cpp +++ b/thirdparty/icu4c/common/ucase.cpp @@ -22,27 +22,14 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/uset.h" -#include "unicode/udata.h" /* UDataInfo */ #include "unicode/utf16.h" -#include "ucmndata.h" /* DataHeader */ -#include "udatamem.h" -#include "umutex.h" -#include "uassert.h" #include "cmemory.h" -#include "utrie2.h" +#include "uassert.h" #include "ucase.h" +#include "umutex.h" +#include "utrie2.h" -struct UCaseProps { - UDataMemory *mem; - const int32_t *indexes; - const uint16_t *exceptions; - const uint16_t *unfold; - - UTrie2 trie; - uint8_t formatVersion[4]; -}; - -/* ucase_props_data.h is machine-generated by gencase --csource */ +/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */ #define INCLUDED_FROM_UCASE_CPP #include "ucase_props_data.h" @@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* data access primitives --------------------------------------------------- */ +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) { + *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions); + *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold); + return &ucase_props_singleton; +} + U_CFUNC const UTrie2 * U_EXPORT2 ucase_getTrie() { return &ucase_props_singleton.trie; @@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) { * - The general category of C is * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or * Letter Modifier (Lm), or Symbol Modifier (Sk) - * - C is one of the following characters + * - C is one of the following characters * U+0027 APOSTROPHE * U+00AD SOFT HYPHEN (SHY) * U+2019 RIGHT SINGLE QUOTATION MARK @@ -1064,6 +1058,8 @@ ucase_toFullLower(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1148,7 +1144,6 @@ ucase_toFullLower(UChar32 c, 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) { /* @@ -1215,6 +1210,8 @@ toUpperOrTitle(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { @@ -1252,7 +1249,6 @@ toUpperOrTitle(UChar32 c, 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(c==0x0587) { // See ICU-13416: @@ -1449,6 +1445,8 @@ ucase_toFullFolding(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1542,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2 u_tolower(UChar32 c) { return ucase_tolower(c); } - + /* Transforms the Unicode character to its upper case equivalent.*/ U_CAPI UChar32 U_EXPORT2 u_toupper(UChar32 c) { diff --git a/thirdparty/icu4c/common/ucase.h b/thirdparty/icu4c/common/ucase.h index a018f82b81..7bf57fd370 100644 --- a/thirdparty/icu4c/common/ucase.h +++ b/thirdparty/icu4c/common/ucase.h @@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c, U_CDECL_END +/* for icuexportdata -------------------------------------------------------- */ + +struct UCaseProps { + void *mem; // TODO: was unused, and type UDataMemory -- remove + const int32_t *indexes; + const uint16_t *exceptions; + const uint16_t *unfold; + + UTrie2 trie; + uint8_t formatVersion[4]; +}; + +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength); + /* file definitions --------------------------------------------------------- */ #define UCASE_DATA_NAME "ucase" diff --git a/thirdparty/icu4c/common/ucasemap.cpp b/thirdparty/icu4c/common/ucasemap.cpp index ed72bda828..95b55d56a0 100644 --- a/thirdparty/icu4c/common/ucasemap.cpp +++ b/thirdparty/icu4c/common/ucasemap.cpp @@ -112,8 +112,7 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { if(length==sizeof(csm->locale)) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } - if(U_SUCCESS(*pErrorCode)) { - csm->caseLocale=UCASE_LOC_UNKNOWN; + if(U_SUCCESS(*pErrorCode)) { csm->caseLocale = ucase_getCaseLocale(csm->locale); } else { csm->locale[0]=0; @@ -420,6 +419,97 @@ void toUpper(int32_t caseLocale, uint32_t options, #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0]; + +constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1]; + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, + ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar32 c2; + c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { + withAcute = true; + unchanged1 = 2; // ACUTE is 2 code units in UTF-8 + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { + return start; + } + if (doTitleJ) { + unchanged2 = 2; // ACUTE is 2 code units in UTF-8 + } else { + unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U8_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); + start += unchanged1; + if (doTitleJ) { + ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); + ++start; + } + ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, @@ -504,19 +594,14 @@ ucasemap_internalUTF8ToTitle( } /* Special case Dutch IJ titlecasing */ - if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits); - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1, - sink, options, edits, errorCode)) { - return; - } - titleLimit++; + if (titleLimit < index && + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); } } diff --git a/thirdparty/icu4c/common/ucnv.cpp b/thirdparty/icu4c/common/ucnv.cpp index 5dcf35e043..019bcb6a79 100644 --- a/thirdparty/icu4c/common/ucnv.cpp +++ b/thirdparty/icu4c/common/ucnv.cpp @@ -252,7 +252,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U UTRACE_EXIT_STATUS(*status); return NULL; } - *status = U_SAFECLONE_ALLOCATED_WARNING; + // If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function. + if (pBufferSize != &stackBufferSize) { + *status = U_SAFECLONE_ALLOCATED_WARNING; + } /* record the fact that memory was allocated */ *pBufferSize = bufferSizeNeeded; @@ -317,7 +320,11 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U return localConverter; } - +U_CAPI UConverter* U_EXPORT2 +ucnv_clone(const UConverter* cnv, UErrorCode *status) +{ + return ucnv_safeClone(cnv, nullptr, nullptr, status); +} /*Decreases the reference counter in the shared immutable section of the object *and frees the mutable part*/ diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp index 67aab4e8ff..6e489e0563 100644 --- a/thirdparty/icu4c/common/ucurr.cpp +++ b/thirdparty/icu4c/common/ucurr.cpp @@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void) } /** - * Deleter for OlsonToMetaMappingEntry + * Deleter for IsoCodeEntry */ static void U_CALLCONV deleteIsoCodeEntry(void *obj) { diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp index c8a3f1ff73..99c6a0af39 100644 --- a/thirdparty/icu4c/common/uloc.cpp +++ b/thirdparty/icu4c/common/uloc.cpp @@ -186,10 +186,10 @@ NULL }; static const char* const DEPRECATED_LANGUAGES[]={ - "in", "iw", "ji", "jw", NULL, NULL + "in", "iw", "ji", "jw", "mo", NULL, NULL }; static const char* const REPLACEMENT_LANGUAGES[]={ - "id", "he", "yi", "jv", NULL, NULL + "id", "he", "yi", "jv", "ro", NULL, NULL }; /** @@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = { /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */ "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF", /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */ - "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", + "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", NULL, /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */ "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h index 252bb7fdc2..0f7e04a3af 100644 --- a/thirdparty/icu4c/common/unicode/localematcher.h +++ b/thirdparty/icu4c/common/unicode/localematcher.h @@ -461,13 +461,13 @@ public: * Option for whether to include or ignore one-way (fallback) match data. * By default, they are included. * - * @param direction the match direction to set. + * @param matchDirection the match direction to set. * @return this Builder object * @stable ICU 67 */ - Builder &setDirection(ULocMatchDirection direction) { + Builder &setDirection(ULocMatchDirection matchDirection) { if (U_SUCCESS(errorCode_)) { - direction_ = direction; + direction_ = matchDirection; } return *this; } diff --git a/thirdparty/icu4c/common/unicode/rbbi.h b/thirdparty/icu4c/common/unicode/rbbi.h index 0ce93819f5..0bad0d3897 100644 --- a/thirdparty/icu4c/common/unicode/rbbi.h +++ b/thirdparty/icu4c/common/unicode/rbbi.h @@ -147,6 +147,11 @@ private: */ int32_t *fLookAheadMatches; + /** + * A flag to indicate if phrase based breaking is enabled. + */ + UBool fIsPhraseBreaking; + //======================================================================= // constructors //======================================================================= @@ -163,6 +168,21 @@ private: */ RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); + /** + * This constructor uses the udata interface to create a BreakIterator + * whose internal tables live in a memory-mapped file. "image" is an + * ICU UDataMemory handle for the pre-compiled break iterator tables. + * @param image handle to the memory image for the break iterator data. + * Ownership of the UDataMemory handle passes to the Break Iterator, + * which will be responsible for closing it when it is no longer needed. + * @param status Information on any errors encountered. + * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. + * @see udata_open + * @see #getBinaryRules + * @internal (private) + */ + RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); + /** @internal */ friend class RBBIRuleBuilder; /** @internal */ diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h index c603f7c13f..2b3dc7aa57 100644 --- a/thirdparty/icu4c/common/unicode/ubrk.h +++ b/thirdparty/icu4c/common/unicode/ubrk.h @@ -312,11 +312,12 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors - * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. + * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used + * if pBufferSize != NULL and any allocations were necessary * @return pointer to the new clone * @deprecated ICU 69 Use ubrk_clone() instead. */ -U_CAPI UBreakIterator * U_EXPORT2 +U_DEPRECATED UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, @@ -325,21 +326,17 @@ ubrk_safeClone( #endif /* U_HIDE_DEPRECATED_API */ -#ifndef U_HIDE_DRAFT_API - /** * Thread safe cloning operation. * @param bi iterator to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status); -#endif // U_HIDE_DRAFT_API - #ifndef U_HIDE_DEPRECATED_API /** diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h index 2687c984d4..20c173b662 100644 --- a/thirdparty/icu4c/common/unicode/ucnv.h +++ b/thirdparty/icu4c/common/unicode/ucnv.h @@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage, * * <p>The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters - * is call this function multiple times, or use the ucnv_safeClone() function to clone a + * is call this function multiple times, or use the ucnv_clone() function to clone a * 'primary' converter.</p> * * <p>A future version of ICU may add alias table lookups and/or caching @@ -493,7 +493,7 @@ ucnv_openCCSID(int32_t codepage, * @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred * @see udata_open * @see ucnv_open - * @see ucnv_safeClone + * @see ucnv_clone * @see ucnv_close * @stable ICU 2.2 */ @@ -502,6 +502,20 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode /** * Thread safe converter cloning operation. + * + * You must ucnv_close() the clone. + * + * @param cnv converter to be cloned + * @param status to indicate whether the operation went on smoothly or there were errors + * @return pointer to the new clone + * @stable ICU 71 + */ +U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status); + +#ifndef U_HIDE_DEPRECATED_API + +/** + * Thread safe converter cloning operation. * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. * If the buffer size is sufficient, then the clone will use the stack buffer; @@ -532,21 +546,19 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode * pointer to size of allocated space. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, - * is used if any allocations were necessary. + * is used if pBufferSize != NULL and any allocations were necessary * However, it is better to check if *pBufferSize grew for checking for * allocations because warning codes can be overridden by subsequent * function calls. * @return pointer to the new clone - * @stable ICU 2.0 + * @deprecated ICU 71 Use ucnv_clone() instead. */ -U_CAPI UConverter * U_EXPORT2 +U_DEPRECATED UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); -#ifndef U_HIDE_DEPRECATED_API - /** * \def U_CNV_SAFECLONE_BUFFERSIZE * Definition of a buffer size that is designed to be large enough for diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h index 730337a353..310c7c8d20 100644 --- a/thirdparty/icu4c/common/unicode/uniset.h +++ b/thirdparty/icu4c/common/unicode/uniset.h @@ -1229,7 +1229,6 @@ public: */ UnicodeSet& retain(UChar32 c); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -1238,10 +1237,9 @@ public: * * @param s the source string * @return this object, for chaining - * @draft ICU 69 + * @stable ICU 69 */ UnicodeSet& retain(const UnicodeString &s); -#endif // U_HIDE_DRAFT_API /** * Removes the specified range from this set if it is present. diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h index 4605f632ea..d9f9b8f336 100644 --- a/thirdparty/icu4c/common/unicode/urename.h +++ b/thirdparty/icu4c/common/unicode/urename.h @@ -567,6 +567,7 @@ #define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure) #define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold) #define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale) +#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton) #define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie) #define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType) #define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable) @@ -630,6 +631,7 @@ #define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars) #define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub) #define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars) +#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone) #define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close) #define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames) #define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert) @@ -725,6 +727,7 @@ #define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString) #define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8) #define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize) +#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone) #define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary) #define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close) #define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements) @@ -904,6 +907,7 @@ #define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern) #define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions) #define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat) +#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle) #define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal) #define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle) #define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName) @@ -918,6 +922,7 @@ #define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat) #define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName) #define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat) +#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle) #define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal) #define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap) #define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close) diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h index 2ef352ef56..33332f2d36 100644 --- a/thirdparty/icu4c/common/unicode/uset.h +++ b/thirdparty/icu4c/common/unicode/uset.h @@ -628,7 +628,6 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end); U_CAPI void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); -#ifndef U_HIDE_DRAFT_API /** * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. @@ -636,11 +635,10 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Removes from this set all of its elements that are contained in the @@ -671,7 +669,6 @@ uset_removeAll(USet* set, const USet* removeSet); U_CAPI void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -681,7 +678,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainString(USet *set, const UChar *str, int32_t length); @@ -693,11 +690,10 @@ uset_retainString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Retains only the elements in this set that are contained in the @@ -741,7 +737,6 @@ uset_compact(USet* set); U_CAPI void U_EXPORT2 uset_complement(USet* set); -#ifndef U_HIDE_DRAFT_API /** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be @@ -753,7 +748,7 @@ uset_complement(USet* set); * @param set the object to be modified * @param start first character, inclusive, of range * @param end last character, inclusive, of range - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementRange(USet *set, UChar32 start, UChar32 end); @@ -766,7 +761,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementString(USet *set, const UChar *str, int32_t length); @@ -778,11 +773,10 @@ uset_complementString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Complements in this set all elements contained in the specified diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h index 42e8865d7e..2706e0b060 100644 --- a/thirdparty/icu4c/common/unicode/uvernum.h +++ b/thirdparty/icu4c/common/unicode/uvernum.h @@ -60,7 +60,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION_MAJOR_NUM 70 +#define U_ICU_VERSION_MAJOR_NUM 71 /** The current ICU minor version as an integer. * This value will change in the subsequent releases of ICU @@ -86,7 +86,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.6 */ -#define U_ICU_VERSION_SUFFIX _70 +#define U_ICU_VERSION_SUFFIX _71 /** * \def U_DEF2_ICU_ENTRY_POINT_RENAME @@ -139,7 +139,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION "70.1" +#define U_ICU_VERSION "71.1" /** * The current ICU library major version number as a string, for library name suffixes. @@ -152,13 +152,13 @@ * * @stable ICU 2.6 */ -#define U_ICU_VERSION_SHORT "70" +#define U_ICU_VERSION_SHORT "71" #ifndef U_HIDE_INTERNAL_API /** Data version in ICU4C. * @internal ICU 4.4 Internal Use Only **/ -#define U_ICU_DATA_VERSION "70.1" +#define U_ICU_DATA_VERSION "71.1" #endif /* U_HIDE_INTERNAL_API */ /*=========================================================================== diff --git a/thirdparty/icu4c/common/unistr.cpp b/thirdparty/icu4c/common/unistr.cpp index 077b4d6ef2..c18665928d 100644 --- a/thirdparty/icu4c/common/unistr.cpp +++ b/thirdparty/icu4c/common/unistr.cpp @@ -334,7 +334,8 @@ Replaceable::clone() const { // UnicodeString overrides clone() with a real implementation UnicodeString * UnicodeString::clone() const { - return new UnicodeString(*this); + LocalPointer<UnicodeString> clonedString(new UnicodeString(*this)); + return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr; } //======================================== @@ -1976,7 +1977,12 @@ The vector deleting destructor is already a part of UObject, but defining it here makes sure that it is included with this object file. This makes sure that static library dependencies are kept to a minimum. */ +#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" static void uprv_UnicodeStringDummy(void) { delete [] (new UnicodeString[2]); } +#pragma GCC diagnostic pop +#endif #endif diff --git a/thirdparty/icu4c/common/ustrcase.cpp b/thirdparty/icu4c/common/ustrcase.cpp index 36b19e75f2..43910ea520 100644 --- a/thirdparty/icu4c/common/ustrcase.cpp +++ b/thirdparty/icu4c/common/ustrcase.cpp @@ -36,6 +36,12 @@ #include "ustr_imp.h" #include "uassert.h" +/** + * Code point for COMBINING ACUTE ACCENT + * @internal + */ +#define ACUTE u'\u0301' + U_NAMESPACE_BEGIN namespace { @@ -396,6 +402,94 @@ U_NAMESPACE_USE #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit, + UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options, + icu::Edits *edits) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE) { + withAcute = true; + unchanged1 = 1; + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if (index == segmentLimit || src[index++] != ACUTE) { return start; } + if (doTitleJ) { + unchanged2 = 1; + } else { + ++unchanged1; + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U16_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits); + start += unchanged1; + if (doTitleJ) { + destIndex = appendUChar(dest, destIndex, destCapacity, u'J'); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + ++start; + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC int32_t U_CALLCONV ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, UChar *dest, int32_t destCapacity, @@ -412,14 +506,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it csc.limit=srcLength; int32_t destIndex=0; int32_t prev=0; - UBool isFirstIndex=TRUE; + bool isFirstIndex=true; /* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */ int32_t index; if(isFirstIndex) { - isFirstIndex=FALSE; + isFirstIndex=false; index=iter->first(); } else { index=iter->next(); @@ -446,7 +540,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it // Stop with titleStart<titleLimit<=index // if there is a character to be titlecased, // or else stop with titleStart==titleLimit==index. - UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; + bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { titleStart=titleLimit; if(titleLimit==index) { @@ -479,27 +573,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it /* Special case Dutch IJ titlecasing */ if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - if(edits!=NULL) { - edits->addReplace(1, 1); - } - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart+1, 1, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - titleLimit++; + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, + dest, destIndex, destCapacity, options, + edits); } } diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp index 4da8b864e1..844463921e 100644 --- a/thirdparty/icu4c/common/uvector.cpp +++ b/thirdparty/icu4c/common/uvector.cpp @@ -99,14 +99,6 @@ bool UVector::operator==(const UVector& other) const { return true; } -// TODO: delete this function once all call sites have been migrated to the -// new addElement(). -void UVector::addElementX(void* obj, UErrorCode &status) { - if (ensureCapacityX(count + 1, status)) { - elements[count++].pointer = obj; - } -} - void UVector::addElement(void* obj, UErrorCode &status) { U_ASSERT(deleter == nullptr); if (ensureCapacity(count + 1, status)) { @@ -331,38 +323,6 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const { return -1; } -UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) { - if (minimumCapacity < 0) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - if (capacity < minimumCapacity) { - if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - int32_t newCap = capacity * 2; - if (newCap < minimumCapacity) { - newCap = minimumCapacity; - } - if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) { // integer overflow check - // We keep the original memory contents on bad minimumCapacity. - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap); - if (newElems == nullptr) { - // We keep the original contents on the memory failure on realloc or bad minimumCapacity. - status = U_MEMORY_ALLOCATION_ERROR; - return FALSE; - } - elements = newElems; - capacity = newCap; - } - return TRUE; -} - - UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (U_FAILURE(status)) { return false; @@ -370,7 +330,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (minimumCapacity < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return false; - } + } if (capacity < minimumCapacity) { if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check status = U_ILLEGAL_ARGUMENT_ERROR; @@ -396,6 +356,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { } return true; } + /** * Change the size of this vector as follows: If newSize is smaller, * then truncate the array, possibly deleting held elements for i >= diff --git a/thirdparty/icu4c/common/uvector.h b/thirdparty/icu4c/common/uvector.h index f61fcc2be6..1eb7d136e7 100644 --- a/thirdparty/icu4c/common/uvector.h +++ b/thirdparty/icu4c/common/uvector.h @@ -123,12 +123,6 @@ public: // java.util.Vector API //------------------------------------------------------------ - /* - * Old version of addElement, with non-standard error handling. - * Will be removed once all uses have been switched to the new addElement(). - */ - void addElementX(void* obj, UErrorCode &status); - /** * Add an element at the end of the vector. * For use only with vectors that do not adopt their elements, which is to say, @@ -197,12 +191,6 @@ public: inline UBool isEmpty(void) const {return count == 0;} - /* - * Old version of ensureCapacity, with non-standard error handling. - * Will be removed once all uses have been switched to the new ensureCapacity(). - */ - UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status); - UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status); /** diff --git a/thirdparty/icu4c/common/uvectr32.cpp b/thirdparty/icu4c/common/uvectr32.cpp index a77ecb689f..2b4d0b8a75 100644 --- a/thirdparty/icu4c/common/uvectr32.cpp +++ b/thirdparty/icu4c/common/uvectr32.cpp @@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) { } -bool UVector32::operator==(const UVector32& other) { +bool UVector32::operator==(const UVector32& other) const { int32_t i; if (count != other.count) return false; for (i=0; i<count; ++i) { diff --git a/thirdparty/icu4c/common/uvectr32.h b/thirdparty/icu4c/common/uvectr32.h index f08c2ade76..ecefb7af3e 100644 --- a/thirdparty/icu4c/common/uvectr32.h +++ b/thirdparty/icu4c/common/uvectr32.h @@ -86,12 +86,12 @@ public: * equal if they are of the same size and all elements are equal, * as compared using this object's comparer. */ - bool operator==(const UVector32& other); + bool operator==(const UVector32& other) const; /** * Equivalent to !operator==() */ - inline bool operator!=(const UVector32& other); + inline bool operator!=(const UVector32& other) const; //------------------------------------------------------------ // java.util.Vector API @@ -268,7 +268,7 @@ inline int32_t UVector32::lastElementi(void) const { return elementAti(count-1); } -inline bool UVector32::operator!=(const UVector32& other) { +inline bool UVector32::operator!=(const UVector32& other) const { return !operator==(other); } diff --git a/thirdparty/icu4c/godot_data.json b/thirdparty/icu4c/godot_data.json index 3a9c28af4c..e36e2b078b 100644 --- a/thirdparty/icu4c/godot_data.json +++ b/thirdparty/icu4c/godot_data.json @@ -6,5 +6,6 @@ brkitr_tree: include misc: include normalization: include + confusables: include } -}
\ No newline at end of file +} diff --git a/thirdparty/icu4c/i18n/scriptset.cpp b/thirdparty/icu4c/i18n/scriptset.cpp new file mode 100644 index 0000000000..6a1db8c01c --- /dev/null +++ b/thirdparty/icu4c/i18n/scriptset.cpp @@ -0,0 +1,313 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* scriptset.cpp +* +* created on: 2013 Jan 7 +* created by: Andy Heninger +*/ + +#include "unicode/utypes.h" + +#include "unicode/uchar.h" +#include "unicode/unistr.h" + +#include "scriptset.h" +#include "uassert.h" +#include "cmemory.h" + +U_NAMESPACE_BEGIN + +//---------------------------------------------------------------------------- +// +// ScriptSet implementation +// +//---------------------------------------------------------------------------- +ScriptSet::ScriptSet() { + uprv_memset(bits, 0, sizeof(bits)); +} + +ScriptSet::~ScriptSet() { +} + +ScriptSet::ScriptSet(const ScriptSet &other) { + *this = other; +} + +ScriptSet & ScriptSet::operator =(const ScriptSet &other) { + uprv_memcpy(bits, other.bits, sizeof(bits)); + return *this; +} + +bool ScriptSet::operator == (const ScriptSet &other) const { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + if (bits[i] != other.bits[i]) { + return false; + } + } + return true; +} + +UBool ScriptSet::test(UScriptCode script, UErrorCode &status) const { + if (U_FAILURE(status)) { + return FALSE; + } + if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return FALSE; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + return ((bits[index] & bit) != 0); +} + + +ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + bits[index] |= bit; + return *this; +} + +ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return *this; + } + uint32_t index = script / 32; + uint32_t bit = 1 << (script & 31); + bits[index] &= ~bit; + return *this; +} + + + +ScriptSet &ScriptSet::Union(const ScriptSet &other) { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + bits[i] |= other.bits[i]; + } + return *this; +} + +ScriptSet &ScriptSet::intersect(const ScriptSet &other) { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + bits[i] &= other.bits[i]; + } + return *this; +} + +ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) { + ScriptSet t; + t.set(script, status); + if (U_SUCCESS(status)) { + this->intersect(t); + } + return *this; +} + +UBool ScriptSet::intersects(const ScriptSet &other) const { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + if ((bits[i] & other.bits[i]) != 0) { + return true; + } + } + return false; +} + +UBool ScriptSet::contains(const ScriptSet &other) const { + ScriptSet t(*this); + t.intersect(other); + return (t == other); +} + + +ScriptSet &ScriptSet::setAll() { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + bits[i] = 0xffffffffu; + } + return *this; +} + + +ScriptSet &ScriptSet::resetAll() { + uprv_memset(bits, 0, sizeof(bits)); + return *this; +} + +int32_t ScriptSet::countMembers() const { + // This bit counter is good for sparse numbers of '1's, which is + // very much the case that we will usually have. + int32_t count = 0; + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + uint32_t x = bits[i]; + while (x > 0) { + count++; + x &= (x - 1); // and off the least significant one bit. + } + } + return count; +} + +int32_t ScriptSet::hashCode() const { + int32_t hash = 0; + for (int32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + hash ^= bits[i]; + } + return hash; +} + +int32_t ScriptSet::nextSetBit(int32_t fromIndex) const { + // TODO: Wants a better implementation. + if (fromIndex < 0) { + return -1; + } + UErrorCode status = U_ZERO_ERROR; + for (int32_t scriptIndex = fromIndex; scriptIndex < SCRIPT_LIMIT; scriptIndex++) { + if (test((UScriptCode)scriptIndex, status)) { + return scriptIndex; + } + } + return -1; +} + +UBool ScriptSet::isEmpty() const { + for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) { + if (bits[i] != 0) { + return FALSE; + } + } + return TRUE; +} + +UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const { + UBool firstTime = TRUE; + for (int32_t i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) { + if (!firstTime) { + dest.append((UChar)0x20); + } + firstTime = FALSE; + const char *scriptName = uscript_getShortName((UScriptCode(i))); + dest.append(UnicodeString(scriptName, -1, US_INV)); + } + return dest; +} + +ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) { + resetAll(); + if (U_FAILURE(status)) { + return *this; + } + UnicodeString oneScriptName; + for (int32_t i=0; i<scriptString.length();) { + UChar32 c = scriptString.char32At(i); + i = scriptString.moveIndex32(i, 1); + if (!u_isUWhiteSpace(c)) { + oneScriptName.append(c); + if (i < scriptString.length()) { + continue; + } + } + if (oneScriptName.length() > 0) { + char buf[40]; + oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV); + buf[sizeof(buf)-1] = 0; + int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf); + if (sc == UCHAR_INVALID_CODE) { + status = U_ILLEGAL_ARGUMENT_ERROR; + } else { + this->set((UScriptCode)sc, status); + } + if (U_FAILURE(status)) { + return *this; + } + oneScriptName.remove(); + } + } + return *this; +} + +void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) { + if (U_FAILURE(status)) { return; } + static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 20; + MaybeStackArray<UScriptCode,FIRST_GUESS_SCRIPT_CAPACITY> scripts; + UErrorCode internalStatus = U_ZERO_ERROR; + int32_t script_count = -1; + + while (TRUE) { + script_count = uscript_getScriptExtensions( + codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus); + if (internalStatus == U_BUFFER_OVERFLOW_ERROR) { + // Need to allocate more space + if (scripts.resize(script_count) == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + internalStatus = U_ZERO_ERROR; + } else { + break; + } + } + + // Check if we failed for some reason other than buffer overflow + if (U_FAILURE(internalStatus)) { + status = internalStatus; + return; + } + + // Load the scripts into the ScriptSet and return + for (int32_t i = 0; i < script_count; i++) { + this->set(scripts[i], status); + if (U_FAILURE(status)) { return; } + } +} + +U_NAMESPACE_END + +U_CAPI UBool U_EXPORT2 +uhash_equalsScriptSet(const UElement key1, const UElement key2) { + icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer); + icu::ScriptSet *s2 = static_cast<icu::ScriptSet *>(key2.pointer); + return (*s1 == *s2); +} + +U_CAPI int8_t U_EXPORT2 +uhash_compareScriptSet(UElement key0, UElement key1) { + icu::ScriptSet *s0 = static_cast<icu::ScriptSet *>(key0.pointer); + icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer); + int32_t diff = s0->countMembers() - s1->countMembers(); + if (diff != 0) return static_cast<UBool>(diff); + int32_t i0 = s0->nextSetBit(0); + int32_t i1 = s1->nextSetBit(0); + while ((diff = i0-i1) == 0 && i0 > 0) { + i0 = s0->nextSetBit(i0+1); + i1 = s1->nextSetBit(i1+1); + } + return (int8_t)diff; +} + +U_CAPI int32_t U_EXPORT2 +uhash_hashScriptSet(const UElement key) { + icu::ScriptSet *s = static_cast<icu::ScriptSet *>(key.pointer); + return s->hashCode(); +} + +U_CAPI void U_EXPORT2 +uhash_deleteScriptSet(void *obj) { + icu::ScriptSet *s = static_cast<icu::ScriptSet *>(obj); + delete s; +} diff --git a/thirdparty/icu4c/i18n/scriptset.h b/thirdparty/icu4c/i18n/scriptset.h new file mode 100644 index 0000000000..51980ab7b3 --- /dev/null +++ b/thirdparty/icu4c/i18n/scriptset.h @@ -0,0 +1,86 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2013, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* +* scriptset.h +* +* created on: 2013 Jan 7 +* created by: Andy Heninger +*/ + +#ifndef __SCRIPTSET_H__ +#define __SCRIPTSET_H__ + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "unicode/uscript.h" + +#include "uelement.h" + +U_NAMESPACE_BEGIN + +//------------------------------------------------------------------------------- +// +// ScriptSet - A bit set representing a set of scripts. +// +// This class was originally used exclusively with script sets appearing +// as part of the spoof check whole script confusable binary data. Its +// use has since become more general, but the continued use to wrap +// prebuilt binary data does constrain the design. +// +//------------------------------------------------------------------------------- +class U_I18N_API ScriptSet: public UMemory { + public: + static constexpr int32_t SCRIPT_LIMIT = 224; // multiple of 32! + + ScriptSet(); + ScriptSet(const ScriptSet &other); + ~ScriptSet(); + + bool operator == (const ScriptSet &other) const; + bool operator != (const ScriptSet &other) const {return !(*this == other);} + ScriptSet & operator = (const ScriptSet &other); + + UBool test(UScriptCode script, UErrorCode &status) const; + ScriptSet &Union(const ScriptSet &other); + ScriptSet &set(UScriptCode script, UErrorCode &status); + ScriptSet &reset(UScriptCode script, UErrorCode &status); + ScriptSet &intersect(const ScriptSet &other); + ScriptSet &intersect(UScriptCode script, UErrorCode &status); + UBool intersects(const ScriptSet &other) const; // Sets contain at least one script in common. + UBool contains(const ScriptSet &other) const; // All set bits in other are also set in this. + + ScriptSet &setAll(); + ScriptSet &resetAll(); + int32_t countMembers() const; + int32_t hashCode() const; + int32_t nextSetBit(int32_t script) const; + + UBool isEmpty() const; + + UnicodeString &displayScripts(UnicodeString &dest) const; // append script names to dest string. + ScriptSet & parseScripts(const UnicodeString &scriptsString, UErrorCode &status); // Replaces ScriptSet contents. + + // Wraps around UScript::getScriptExtensions() and adds the corresponding scripts to this instance. + void setScriptExtensions(UChar32 codePoint, UErrorCode& status); + + private: + uint32_t bits[SCRIPT_LIMIT / 32]; +}; + +U_NAMESPACE_END + +U_CAPI UBool U_EXPORT2 +uhash_compareScriptSet(const UElement key1, const UElement key2); + +U_CAPI int32_t U_EXPORT2 +uhash_hashScriptSet(const UElement key); + +U_CAPI void U_EXPORT2 +uhash_deleteScriptSet(void *obj); + +#endif // __SCRIPTSET_H__ diff --git a/thirdparty/icu4c/i18n/ucln_in.cpp b/thirdparty/icu4c/i18n/ucln_in.cpp new file mode 100644 index 0000000000..f29cbe41dd --- /dev/null +++ b/thirdparty/icu4c/i18n/ucln_in.cpp @@ -0,0 +1,65 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* * +* Copyright (C) 2001-2014, International Business Machines * +* Corporation and others. All Rights Reserved. * +* * +****************************************************************************** +* file name: ucln_in.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#include "ucln.h" +#include "ucln_in.h" +#include "mutex.h" +#include "uassert.h" + +/** Auto-client for UCLN_I18N **/ +#define UCLN_TYPE UCLN_I18N +#include "ucln_imp.h" + +/* Leave this copyright notice here! It needs to go somewhere in this library. */ +static const char copyright[] = U_COPYRIGHT_STRING; + +static cleanupFunc *gCleanupFunctions[UCLN_I18N_COUNT]; + +static UBool U_CALLCONV i18n_cleanup(void) +{ + int32_t libType = UCLN_I18N_START; + (void)copyright; /* Suppress unused variable warning with clang. */ + + while (++libType<UCLN_I18N_COUNT) { + if (gCleanupFunctions[libType]) + { + gCleanupFunctions[libType](); + gCleanupFunctions[libType] = NULL; + } + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_unRegisterAutomaticCleanup(); +#endif + return TRUE; +} + +void ucln_i18n_registerCleanup(ECleanupI18NType type, + cleanupFunc *func) { + U_ASSERT(UCLN_I18N_START < type && type < UCLN_I18N_COUNT); + { + icu::Mutex m; // See ticket 10295 for discussion. + ucln_registerCleanup(UCLN_I18N, i18n_cleanup); + if (UCLN_I18N_START < type && type < UCLN_I18N_COUNT) { + gCleanupFunctions[type] = func; + } + } +#if !UCLN_NO_AUTO_CLEANUP && (defined(UCLN_AUTO_ATEXIT) || defined(UCLN_AUTO_LOCAL)) + ucln_registerAutomaticCleanup(); +#endif +} + diff --git a/thirdparty/icu4c/i18n/ucln_in.h b/thirdparty/icu4c/i18n/ucln_in.h new file mode 100644 index 0000000000..765cdd559f --- /dev/null +++ b/thirdparty/icu4c/i18n/ucln_in.h @@ -0,0 +1,74 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +****************************************************************************** +* Copyright (C) 2001-2016, International Business Machines +* Corporation and others. All Rights Reserved. +****************************************************************************** +* file name: ucln_in.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2001July05 +* created by: George Rhoten +*/ + +#ifndef __UCLN_IN_H__ +#define __UCLN_IN_H__ + +#include "unicode/utypes.h" +#include "ucln.h" + +/* +Please keep the order of enums declared in same order +as the functions are suppose to be called. +It's usually best to have child dependencies called first. */ +typedef enum ECleanupI18NType { + UCLN_I18N_START = -1, + UCLN_I18N_UNIT_EXTRAS, + UCLN_I18N_NUMBER_SKELETONS, + UCLN_I18N_CURRENCY_SPACING, + UCLN_I18N_SPOOF, + UCLN_I18N_SPOOFDATA, + UCLN_I18N_TRANSLITERATOR, + UCLN_I18N_REGEX, + UCLN_I18N_JAPANESE_CALENDAR, + UCLN_I18N_ISLAMIC_CALENDAR, + UCLN_I18N_CHINESE_CALENDAR, + UCLN_I18N_HEBREW_CALENDAR, + UCLN_I18N_ASTRO_CALENDAR, + UCLN_I18N_DANGI_CALENDAR, + UCLN_I18N_CALENDAR, + UCLN_I18N_TIMEZONEFORMAT, + UCLN_I18N_TZDBTIMEZONENAMES, + UCLN_I18N_TIMEZONEGENERICNAMES, + UCLN_I18N_TIMEZONENAMES, + UCLN_I18N_ZONEMETA, + UCLN_I18N_TIMEZONE, + UCLN_I18N_DIGITLIST, + UCLN_I18N_DECFMT, + UCLN_I18N_NUMFMT, + UCLN_I18N_ALLOWED_HOUR_FORMATS, + UCLN_I18N_DAYPERIODRULES, + UCLN_I18N_SMPDTFMT, + UCLN_I18N_USEARCH, + UCLN_I18N_COLLATOR, + UCLN_I18N_UCOL_RES, + UCLN_I18N_CSDET, + UCLN_I18N_COLLATION_ROOT, + UCLN_I18N_GENDERINFO, + UCLN_I18N_CDFINFO, + UCLN_I18N_REGION, + UCLN_I18N_LIST_FORMATTER, + UCLN_I18N_NUMSYS, + UCLN_I18N_COUNT /* This must be last */ +} ECleanupI18NType; + +/* Main library cleanup registration function. */ +/* See common/ucln.h for details on adding a cleanup function. */ +/* Note: the global mutex must not be held when calling this function. */ +U_CFUNC void U_EXPORT2 ucln_i18n_registerCleanup(ECleanupI18NType type, + cleanupFunc *func); + +#endif diff --git a/thirdparty/icu4c/i18n/unicode/uspoof.h b/thirdparty/icu4c/i18n/unicode/uspoof.h new file mode 100644 index 0000000000..b674c91b2c --- /dev/null +++ b/thirdparty/icu4c/i18n/unicode/uspoof.h @@ -0,0 +1,1577 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +*************************************************************************** +* Copyright (C) 2008-2016, International Business Machines Corporation +* and others. All Rights Reserved. +*************************************************************************** +* file name: uspoof.h +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2008Feb13 +* created by: Andy Heninger +* +* Unicode Spoof Detection +*/ + +#ifndef USPOOF_H +#define USPOOF_H + +#include "unicode/utypes.h" +#include "unicode/uset.h" +#include "unicode/parseerr.h" + +#if !UCONFIG_NO_NORMALIZATION + + +#if U_SHOW_CPLUSPLUS_API +#include "unicode/localpointer.h" +#include "unicode/unistr.h" +#include "unicode/uniset.h" +#endif + + +/** + * \file + * \brief Unicode Security and Spoofing Detection, C API. + * + * <p> + * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and + * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: + * + * <ol> + * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and + * "Ηarvest", where the second string starts with the Greek capital letter Eta.</li> + * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof + * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li> + * </ol> + * + * <p> + * Although originally designed as a method for flagging suspicious identifier strings such as URLs, + * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word + * content filters. + * + * <p> + * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. + * + * <h2>Confusables</h2> + * + * <p> + * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * UChar* str1 = (UChar*) u"Harvest"; + * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); + * + * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); + * UBool result = bitmask != 0; + * // areConfusable: 1 (status: U_ZERO_ERROR) + * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); + * uspoof_close(sc); + * \endcode + * + * <p> + * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks} + * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the + * confusability test; and the following line extracts the result out of the return value. For best performance, + * the instance should be created once (e.g., upon application startup), and the efficient + * {@link uspoof_areConfusable} method can be used at runtime. + * + * <p> + * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call + * {@link uspoof_close} when the object goes out of scope: + * + * \code{.cpp} + * UErrorCode status = U_ZERO_ERROR; + * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); + * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); + * // ... + * \endcode + * + * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can + * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so + * the following snippet is equivalent to the example above: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * UChar* str1 = (UChar*) u"Harvest"; + * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); + * + * // Get skeleton 1 + * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); + * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); + * status = U_ZERO_ERROR; + * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); + * + * // Get skeleton 2 + * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); + * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); + * status = U_ZERO_ERROR; + * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); + * + * // Are the skeletons the same? + * UBool result = u_strcmp(skel1, skel2) == 0; + * // areConfusable: 1 (status: U_ZERO_ERROR) + * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); + * uspoof_close(sc); + * free(skel1); + * free(skel2); + * \endcode + * + * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling + * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * #define DICTIONARY_LENGTH 2 + * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; + * UChar* skeletons[DICTIONARY_LENGTH]; + * UChar* str = (UChar*) u"1orern"; + * + * // Setup: + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); + * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { + * UChar* word = dictionary[i]; + * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); + * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); + * status = U_ZERO_ERROR; + * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); + * } + * + * // Live Check: + * { + * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); + * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); + * status = U_ZERO_ERROR; + * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); + * UBool result = false; + * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { + * result = u_strcmp(skel, skeletons[i]) == 0; + * if (result == true) { break; } + * } + * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) + * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); + * free(skel); + * } + * + * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { + * free(skeletons[i]); + * } + * uspoof_close(sc); + * \endcode + * + * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> + * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons + * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. + * + * <h2>Spoof Detection</h2> + * + * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a + * string: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A + * + * // Get the default set of allowable characters: + * USet* allowed = uset_openEmpty(); + * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); + * uset_addAll(allowed, uspoof_getInclusionSet(&status)); + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setAllowedChars(sc, allowed, &status); + * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); + * + * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); + * UBool result = bitmask != 0; + * // fails checks: 1 (status: U_ZERO_ERROR) + * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); + * uspoof_close(sc); + * uset_close(allowed); + * \endcode + * + * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at + * startup, and call the cheaper {@link uspoof_check} online. We specify the set of + * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. + * + * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, + * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. + * + * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks + * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions + * with a {@link USpoofCheckResult} parameter: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A + * + * // Get the default set of allowable characters: + * USet* allowed = uset_openEmpty(); + * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); + * uset_addAll(allowed, uspoof_getInclusionSet(&status)); + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setAllowedChars(sc, allowed, &status); + * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); + * + * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); + * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); + * + * int32_t failures1 = bitmask; + * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); + * assert(failures1 == failures2); + * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) + * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); + * + * // Cleanup: + * uspoof_close(sc); + * uset_close(allowed); + * uspoof_closeCheckResult(checkResult); + * \endcode + * + * C++ users can take advantage of a few syntactical conveniences. The following snippet is functionally + * equivalent to the one above: + * + * \code{.cpp} + * UErrorCode status = U_ZERO_ERROR; + * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A + * + * // Get the default set of allowable characters: + * UnicodeSet allowed; + * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); + * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); + * + * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); + * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); + * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); + * + * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); + * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); + * + * int32_t failures1 = bitmask; + * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); + * assert(failures1 == failures2); + * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) + * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); + * + * // Explicit cleanup not necessary. + * \endcode + * + * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: + * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are: + * + * <ul> + * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the + * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS + * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> + * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character + * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> + * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable + * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li> + * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> + * </ul> + * + * <p> + * These checks can be enabled independently of each other. For example, if you were interested in checking for only the + * INVISIBLE and MIXED_NUMBERS conditions, you could do: + * + * \code{.c} + * UErrorCode status = U_ZERO_ERROR; + * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR + * + * USpoofChecker* sc = uspoof_open(&status); + * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); + * + * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); + * UBool result = bitmask != 0; + * // fails checks: 1 (status: U_ZERO_ERROR) + * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); + * uspoof_close(sc); + * \endcode + * + * Here is an example in C++ showing how to compute the restriction level of a string: + * + * \code{.cpp} + * UErrorCode status = U_ZERO_ERROR; + * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A + * + * // Get the default set of allowable characters: + * UnicodeSet allowed; + * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); + * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); + * + * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); + * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); + * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); + * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); + * + * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); + * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); + * + * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); + * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: + * assert((restrictionLevel & bitmask) == restrictionLevel); + * // Restriction level: 0x50000000 (status: U_ZERO_ERROR) + * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status)); + * \endcode + * + * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since + * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. + * + * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in + * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings + * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have + * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is + * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed + * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on + * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of + * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code + * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple + * scripts. + * + * <h2>Additional Information</h2> + * + * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. + * + * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether + * two identifiers are possible confusable, are thread safe. They may called concurrently, from multiple threads, + * using the same USpoofChecker instance. + * + * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are + * thread safe. Those that take a non-const USpoofChecker are not thread safe.. + * + * @stable ICU 4.6 + */ + +U_CDECL_BEGIN + +struct USpoofChecker; +/** + * @stable ICU 4.2 + */ +typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */ + +struct USpoofCheckResult; +/** + * @see uspoof_openCheckResult + * @stable ICU 58 + */ +typedef struct USpoofCheckResult USpoofCheckResult; + +/** + * Enum for the kinds of checks that USpoofChecker can perform. + * These enum values are used both to select the set of checks that + * will be performed, and to report results from the check function. + * + * @stable ICU 4.2 + */ +typedef enum USpoofChecks { + /** + * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section + * 4. + * + * @see uspoof_areConfusable + * @stable ICU 4.2 + */ + USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1, + + /** + * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS + * 39 section 4. + * + * @see uspoof_areConfusable + * @stable ICU 4.2 + */ + USPOOF_MIXED_SCRIPT_CONFUSABLE = 2, + + /** + * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates + * that the two strings are visually confusable and that they are not from the same script but both of them are + * single-script strings, according to UTS 39 section 4. + * + * @see uspoof_areConfusable + * @stable ICU 4.2 + */ + USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4, + + /** + * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set + * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to + * make {@link uspoof_areConfusable} return only those types of confusables. + * + * @see uspoof_areConfusable + * @see uspoof_getSkeleton + * @stable ICU 58 + */ + USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, + +#ifndef U_HIDE_DEPRECATED_API + /** + * This flag is deprecated and no longer affects the behavior of SpoofChecker. + * + * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated. + */ + USPOOF_ANY_CASE = 8, +#endif /* U_HIDE_DEPRECATED_API */ + + /** + * Check that an identifier is no looser than the specified RestrictionLevel. + * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE. + * + * If USPOOF_AUX_INFO is enabled the actual restriction level of the + * identifier being tested will also be returned by uspoof_check(). + * + * @see URestrictionLevel + * @see uspoof_setRestrictionLevel + * @see USPOOF_AUX_INFO + * + * @stable ICU 51 + */ + USPOOF_RESTRICTION_LEVEL = 16, + +#ifndef U_HIDE_DEPRECATED_API + /** Check that an identifier contains only characters from a + * single script (plus chars from the common and inherited scripts.) + * Applies to checks of a single identifier check only. + * @deprecated ICU 51 Use RESTRICTION_LEVEL instead. + */ + USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL, +#endif /* U_HIDE_DEPRECATED_API */ + + /** Check an identifier for the presence of invisible characters, + * such as zero-width spaces, or character sequences that are + * likely not to display, such as multiple occurrences of the same + * non-spacing mark. This check does not test the input string as a whole + * for conformance to any particular syntax for identifiers. + */ + USPOOF_INVISIBLE = 32, + + /** Check that an identifier contains only characters from a specified set + * of acceptable characters. See {@link uspoof_setAllowedChars} and + * {@link uspoof_setAllowedLocales}. Note that a string that fails this check + * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. + */ + USPOOF_CHAR_LIMIT = 64, + + /** + * Check that an identifier does not mix numbers from different numbering systems. + * For more information, see UTS 39 section 5.3. + * + * @stable ICU 51 + */ + USPOOF_MIXED_NUMBERS = 128, + + /** + * Check that an identifier does not have a combining character following a character in which that + * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. + * + * More specifically, the following characters are forbidden from preceding a U+0307: + * <ul> + * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li> + * <li>Latin lowercase letter 'l'</li> + * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li> + * <li>Any character whose confusable prototype ends with such a character + * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li> + * </ul> + * In addition, combining characters are allowed between the above characters and U+0307 except those + * with combining class 0 or combining class "Above" (230, same class as U+0307). + * + * This list and the number of combing characters considered by this check may grow over time. + * + * @stable ICU 62 + */ + USPOOF_HIDDEN_OVERLAY = 256, + + /** + * Enable all spoof checks. + * + * @stable ICU 4.6 + */ + USPOOF_ALL_CHECKS = 0xFFFF, + + /** + * Enable the return of auxiliary (non-error) information in the + * upper bits of the check results value. + * + * If this "check" is not enabled, the results of {@link uspoof_check} will be + * zero when an identifier passes all of the enabled checks. + * + * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will + * be zero when an identifier passes all checks. + * + * @stable ICU 51 + */ + USPOOF_AUX_INFO = 0x40000000 + + } USpoofChecks; + + + /** + * Constants from UAX #39 for use in {@link uspoof_setRestrictionLevel}, and + * for returned identifier restriction levels in check results. + * + * @stable ICU 51 + * + * @see uspoof_setRestrictionLevel + * @see uspoof_check + */ + typedef enum URestrictionLevel { + /** + * All characters in the string are in the identifier profile and all characters in the string are in the + * ASCII range. + * + * @stable ICU 51 + */ + USPOOF_ASCII = 0x10000000, + /** + * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and + * the string is single-script, according to the definition in UTS 39 section 5.1. + * + * @stable ICU 53 + */ + USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, + /** + * The string classifies as Single Script, or all characters in the string are in the identifier profile and + * the string is covered by any of the following sets of scripts, according to the definition in UTS 39 + * section 5.1: + * <ul> + * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li> + * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li> + * <li>Latin + Han + Hangul (or equivalently: Latn +Kore)</li> + * </ul> + * This is the default restriction in ICU. + * + * @stable ICU 51 + */ + USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, + /** + * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile + * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, + * Greek, and Cherokee. + * + * @stable ICU 51 + */ + USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, + /** + * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts. + * + * @stable ICU 51 + */ + USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, + /** + * Any valid identifiers, including characters outside of the Identifier Profile. + * + * @stable ICU 51 + */ + USPOOF_UNRESTRICTIVE = 0x60000000, + /** + * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}. + * + * @stable ICU 53 + */ + USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000, +#ifndef U_HIDE_INTERNAL_API + /** + * An undefined restriction level. + * @internal + */ + USPOOF_UNDEFINED_RESTRICTIVE = -1 +#endif /* U_HIDE_INTERNAL_API */ + } URestrictionLevel; + +/** + * Create a Unicode Spoof Checker, configured to perform all + * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. + * Note that additional checks may be added in the future, + * resulting in the changes to the default checking behavior. + * + * @param status The error code, set if this function encounters a problem. + * @return the newly created Spoof Checker + * @stable ICU 4.2 + */ +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_open(UErrorCode *status); + + +/** + * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory. + * Inverse of uspoof_serialize(). + * The memory containing the serialized data must remain valid and unchanged + * as long as the spoof checker, or any cloned copies of the spoof checker, + * are in use. Ownership of the memory remains with the caller. + * The spoof checker (and any clones) must be closed prior to deleting the + * serialized data. + * + * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data + * @param length the number of bytes available at data; + * can be more than necessary + * @param pActualLength receives the actual number of bytes at data taken up by the data; + * can be NULL + * @param pErrorCode ICU error code + * @return the spoof checker. + * + * @see uspoof_open + * @see uspoof_serialize + * @stable ICU 4.2 + */ +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, + UErrorCode *pErrorCode); + +/** + * Open a Spoof Checker from the source form of the spoof data. + * The input corresponds to the Unicode data file confusables.txt + * as described in Unicode UAX #39. The syntax of the source data + * is as described in UAX #39 for this file, and the content of + * this file is acceptable input. + * + * The character encoding of the (char *) input text is UTF-8. + * + * @param confusables a pointer to the confusable characters definitions, + * as found in file confusables.txt from unicode.org. + * @param confusablesLen The length of the confusables text, or -1 if the + * input string is zero terminated. + * @param confusablesWholeScript + * Deprecated in ICU 58. No longer used. + * @param confusablesWholeScriptLen + * Deprecated in ICU 58. No longer used. + * @param errType In the event of an error in the input, indicates + * which of the input files contains the error. + * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or + * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or + * zero if no errors are found. + * @param pe In the event of an error in the input, receives the position + * in the input text (line, offset) of the error. + * @param status an in/out ICU UErrorCode. Among the possible errors is + * U_PARSE_ERROR, which is used to report syntax errors + * in the input. + * @return A spoof checker that uses the rules from the input files. + * @stable ICU 4.2 + */ +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_openFromSource(const char *confusables, int32_t confusablesLen, + const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, + int32_t *errType, UParseError *pe, UErrorCode *status); + + +/** + * Close a Spoof Checker, freeing any memory that was being held by + * its implementation. + * @stable ICU 4.2 + */ +U_CAPI void U_EXPORT2 +uspoof_close(USpoofChecker *sc); + +/** + * Clone a Spoof Checker. The clone will be set to perform the same checks + * as the original source. + * + * @param sc The source USpoofChecker + * @param status The error code, set if this function encounters a problem. + * @return + * @stable ICU 4.2 + */ +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_clone(const USpoofChecker *sc, UErrorCode *status); + + +/** + * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method + * overwrites any checks that may have already been enabled. By default, all checks are enabled. + * + * To enable specific checks and disable all others, + * OR together only the bit constants for the desired checks. + * For example, to fail strings containing characters outside of + * the set specified by {@link uspoof_setAllowedChars} and + * also strings that contain digits from mixed numbering systems: + * + * <pre> + * {@code + * uspoof_setChecks(USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS); + * } + * </pre> + * + * To disable specific checks and enable all others, + * start with ALL_CHECKS and "AND away" the not-desired checks. + * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality, + * it is good practice to disable the CONFUSABLE check: + * + * <pre> + * {@code + * uspoof_setChecks(USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE); + * } + * </pre> + * + * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and + * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they + * enable onto the existing bitmask specified by this method. For more details, see the documentation of those + * methods. + * + * @param sc The USpoofChecker + * @param checks The set of checks that this spoof checker will perform. + * The value is a bit set, obtained by OR-ing together + * values from enum USpoofChecks. + * @param status The error code, set if this function encounters a problem. + * @stable ICU 4.2 + * + */ +U_CAPI void U_EXPORT2 +uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); + +/** + * Get the set of checks that this Spoof Checker has been configured to perform. + * + * @param sc The USpoofChecker + * @param status The error code, set if this function encounters a problem. + * @return The set of checks that this spoof checker will perform. + * The value is a bit set, obtained by OR-ing together + * values from enum USpoofChecks. + * @stable ICU 4.2 + * + */ +U_CAPI int32_t U_EXPORT2 +uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); + +/** + * Set the loosest restriction level allowed for strings. The default if this is not called is + * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and + * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are + * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. + * + * @param sc The USpoofChecker + * @param restrictionLevel The loosest restriction level allowed. + * @see URestrictionLevel + * @stable ICU 51 + */ +U_CAPI void U_EXPORT2 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); + + +/** + * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}. + * + * @return The restriction level + * @see URestrictionLevel + * @stable ICU 51 + */ +U_CAPI URestrictionLevel U_EXPORT2 +uspoof_getRestrictionLevel(const USpoofChecker *sc); + +/** + * Limit characters that are acceptable in identifiers being checked to those + * normally used with the languages associated with the specified locales. + * Any previously specified list of locales is replaced by the new settings. + * + * A set of languages is determined from the locale(s), and + * from those a set of acceptable Unicode scripts is determined. + * Characters from this set of scripts, along with characters from + * the "common" and "inherited" Unicode Script categories + * will be permitted. + * + * Supplying an empty string removes all restrictions; + * characters from any script will be allowed. + * + * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this + * USpoofChecker when calling this function with a non-empty list + * of locales. + * + * The Unicode Set of characters that will be allowed is accessible + * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() + * will <i>replace</i> any previously applied set of allowed characters. + * + * Adjustments, such as additions or deletions of certain classes of characters, + * can be made to the result of uspoof_setAllowedLocales() by + * fetching the resulting set with uspoof_getAllowedChars(), + * manipulating it with the Unicode Set API, then resetting the + * spoof detectors limits with uspoof_setAllowedChars(). + * + * @param sc The USpoofChecker + * @param localesList A list list of locales, from which the language + * and associated script are extracted. The locales + * are comma-separated if there is more than one. + * White space may not appear within an individual locale, + * but is ignored otherwise. + * The locales are syntactically like those from the + * HTTP Accept-Language header. + * If the localesList is empty, no restrictions will be placed on + * the allowed characters. + * + * @param status The error code, set if this function encounters a problem. + * @stable ICU 4.2 + */ +U_CAPI void U_EXPORT2 +uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); + +/** + * Get a list of locales for the scripts that are acceptable in strings + * to be checked. If no limitations on scripts have been specified, + * an empty string will be returned. + * + * uspoof_setAllowedChars() will reset the list of allowed to be empty. + * + * The format of the returned list is the same as that supplied to + * uspoof_setAllowedLocales(), but returned list may not be identical + * to the originally specified string; the string may be reformatted, + * and information other than languages from + * the originally specified locales may be omitted. + * + * @param sc The USpoofChecker + * @param status The error code, set if this function encounters a problem. + * @return A string containing a list of locales corresponding + * to the acceptable scripts, formatted like an + * HTTP Accept Language value. + * + * @stable ICU 4.2 + */ +U_CAPI const char * U_EXPORT2 +uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); + + +/** + * Limit the acceptable characters to those specified by a Unicode Set. + * Any previously specified character limit is + * is replaced by the new settings. This includes limits on + * characters that were set with the uspoof_setAllowedLocales() function. + * + * The USPOOF_CHAR_LIMIT test is automatically enabled for this + * USpoofChecker by this function. + * + * @param sc The USpoofChecker + * @param chars A Unicode Set containing the list of + * characters that are permitted. Ownership of the set + * remains with the caller. The incoming set is cloned by + * this function, so there are no restrictions on modifying + * or deleting the USet after calling this function. + * @param status The error code, set if this function encounters a problem. + * @stable ICU 4.2 + */ +U_CAPI void U_EXPORT2 +uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); + + +/** + * Get a USet for the characters permitted in an identifier. + * This corresponds to the limits imposed by the Set Allowed Characters + * functions. Limitations imposed by other checks will not be + * reflected in the set returned by this function. + * + * The returned set will be frozen, meaning that it cannot be modified + * by the caller. + * + * Ownership of the returned set remains with the Spoof Detector. The + * returned set will become invalid if the spoof detector is closed, + * or if a new set of allowed characters is specified. + * + * + * @param sc The USpoofChecker + * @param status The error code, set if this function encounters a problem. + * @return A USet containing the characters that are permitted by + * the USPOOF_CHAR_LIMIT test. + * @stable ICU 4.2 + */ +U_CAPI const USet * U_EXPORT2 +uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); + + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * \note + * Consider using the newer API, {@link uspoof_check2}, instead. + * The newer API exposes additional information from the check procedure + * and is otherwise identical to this method. + * + * @param sc The USpoofChecker + * @param id The identifier to be checked for possible security issues, + * in UTF-16 format. + * @param length the length of the string to be checked, expressed in + * 16 bit UTF-16 code units, or -1 if the string is + * zero terminated. + * @param position Deprecated in ICU 51. Always returns zero. + * Originally, an out parameter for the index of the first + * string position that failed a check. + * This parameter may be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. + * @see uspoof_check2 + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_check(const USpoofChecker *sc, + const UChar *id, int32_t length, + int32_t *position, + UErrorCode *status); + + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * \note + * Consider using the newer API, {@link uspoof_check2UTF8}, instead. + * The newer API exposes additional information from the check procedure + * and is otherwise identical to this method. + * + * @param sc The USpoofChecker + * @param id A identifier to be checked for possible security issues, in UTF8 format. + * @param length the length of the string to be checked, or -1 if the string is + * zero terminated. + * @param position Deprecated in ICU 51. Always returns zero. + * Originally, an out parameter for the index of the first + * string position that failed a check. + * This parameter may be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * If the input contains invalid UTF-8 sequences, + * a status of U_INVALID_CHAR_FOUND will be returned. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. + * @see uspoof_check2UTF8 + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_checkUTF8(const USpoofChecker *sc, + const char *id, int32_t length, + int32_t *position, + UErrorCode *status); + + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * @param sc The USpoofChecker + * @param id The identifier to be checked for possible security issues, + * in UTF-16 format. + * @param length the length of the string to be checked, or -1 if the string is + * zero terminated. + * @param checkResult An instance of USpoofCheckResult to be filled with + * details about the identifier. Can be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. Any information in this bitmask will be + * consistent with the information saved in the optional + * checkResult parameter. + * @see uspoof_openCheckResult + * @see uspoof_check2UTF8 + * @see uspoof_check2UnicodeString + * @stable ICU 58 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_check2(const USpoofChecker *sc, + const UChar* id, int32_t length, + USpoofCheckResult* checkResult, + UErrorCode *status); + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * This version of {@link uspoof_check} accepts a USpoofCheckResult, which + * returns additional information about the identifier. For more + * information, see {@link uspoof_openCheckResult}. + * + * @param sc The USpoofChecker + * @param id A identifier to be checked for possible security issues, in UTF8 format. + * @param length the length of the string to be checked, or -1 if the string is + * zero terminated. + * @param checkResult An instance of USpoofCheckResult to be filled with + * details about the identifier. Can be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. Any information in this bitmask will be + * consistent with the information saved in the optional + * checkResult parameter. + * @see uspoof_openCheckResult + * @see uspoof_check2 + * @see uspoof_check2UnicodeString + * @stable ICU 58 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_check2UTF8(const USpoofChecker *sc, + const char *id, int32_t length, + USpoofCheckResult* checkResult, + UErrorCode *status); + +/** + * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return + * information about the identifier. Information includes: + * <ul> + * <li>A bitmask of the checks that failed</li> + * <li>The identifier's restriction level (UTS 39 section 5.2)</li> + * <li>The set of numerics in the string (UTS 39 section 5.3)</li> + * </ul> + * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call + * of {@link uspoof_check2}. + * + * @param status The error code, set if this function encounters a problem. + * @return the newly created USpoofCheckResult + * @see uspoof_check2 + * @see uspoof_check2UTF8 + * @see uspoof_check2UnicodeString + * @stable ICU 58 + */ +U_CAPI USpoofCheckResult* U_EXPORT2 +uspoof_openCheckResult(UErrorCode *status); + +/** + * Close a USpoofCheckResult, freeing any memory that was being held by + * its implementation. + * + * @param checkResult The instance of USpoofCheckResult to close + * @stable ICU 58 + */ +U_CAPI void U_EXPORT2 +uspoof_closeCheckResult(USpoofCheckResult *checkResult); + +/** + * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests + * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on. + * + * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} + * @param status The error code, set if an error occurred. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. + * @see uspoof_setChecks + * @stable ICU 58 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status); + +/** + * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check + * was enabled; otherwise, undefined. + * + * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} + * @param status The error code, set if an error occurred. + * @return The restriction level contained in the USpoofCheckResult + * @see uspoof_setRestrictionLevel + * @stable ICU 58 + */ +U_CAPI URestrictionLevel U_EXPORT2 +uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status); + +/** + * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled; + * otherwise, undefined. The set will contain the zero digit from each decimal number system found + * in the input string. Ownership of the returned USet remains with the USpoofCheckResult. + * The USet will be free'd when {@link uspoof_closeCheckResult} is called. + * + * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} + * @return The set of numerics contained in the USpoofCheckResult + * @param status The error code, set if an error occurred. + * @stable ICU 58 + */ +U_CAPI const USet* U_EXPORT2 +uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status); + + +/** + * Check the whether two specified strings are visually confusable. + * + * If the strings are confusable, the return value will be nonzero, as long as + * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). + * + * The bits in the return value correspond to flags for each of the classes of + * confusables applicable to the two input strings. According to UTS 39 + * section 4, the possible flags are: + * + * <ul> + * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> + * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> + * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> + * </ul> + * + * If one or more of the above flags were not listed in uspoof_setChecks(), this + * function will never report that class of confusable. The check + * {@link USPOOF_CONFUSABLE} enables all three flags. + * + * + * @param sc The USpoofChecker + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-16 format. + * @param length1 the length of the first identifier, expressed in + * 16 bit UTF-16 code units, or -1 if the string is + * nul terminated. + * @param id2 The second of the two identifiers to be compared for + * confusability. The identifiers are in UTF-16 format. + * @param length2 The length of the second identifiers, expressed in + * 16 bit UTF-16 code units, or -1 if the string is + * nul terminated. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the identifiers is not reported here, + * but through this function's return value. + * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the identifiers + * are not confusable. + * + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusable(const USpoofChecker *sc, + const UChar *id1, int32_t length1, + const UChar *id2, int32_t length2, + UErrorCode *status); + + + +/** + * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. + * + * @param sc The USpoofChecker + * @param id1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param length1 the length of the first identifiers, in bytes, or -1 + * if the string is nul terminated. + * @param id2 The second of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param length2 The length of the second string in bytes, or -1 + * if the string is nul terminated. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the strings is not reported here, + * but through this function's return value. + * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the strings + * are not confusable. + * + * @stable ICU 4.2 + * + * @see uspoof_areConfusable + */ +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUTF8(const USpoofChecker *sc, + const char *id1, int32_t length1, + const char *id2, int32_t length2, + UErrorCode *status); + + + + +/** + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker + * @param type Deprecated in ICU 58. You may pass any number. + * Originally, controlled which of the Unicode confusable data + * tables to use. + * @param id The input identifier whose skeleton will be computed. + * @param length The length of the input identifier, expressed in 16 bit + * UTF-16 code units, or -1 if the string is zero terminated. + * @param dest The output buffer, to receive the skeleton string. + * @param destCapacity The length of the output buffer, in 16 bit units. + * The destCapacity may be zero, in which case the function will + * return the actual length of the skeleton. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * @return The length of the skeleton string. The returned length + * is always that of the complete skeleton, even when the + * supplied buffer is too small (or of zero length) + * + * @stable ICU 4.2 + * @see uspoof_areConfusable + */ +U_CAPI int32_t U_EXPORT2 +uspoof_getSkeleton(const USpoofChecker *sc, + uint32_t type, + const UChar *id, int32_t length, + UChar *dest, int32_t destCapacity, + UErrorCode *status); + +/** + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker + * @param type Deprecated in ICU 58. You may pass any number. + * Originally, controlled which of the Unicode confusable data + * tables to use. + * @param id The UTF-8 format identifier whose skeleton will be computed. + * @param length The length of the input string, in bytes, + * or -1 if the string is zero terminated. + * @param dest The output buffer, to receive the skeleton string. + * @param destCapacity The length of the output buffer, in bytes. + * The destCapacity may be zero, in which case the function will + * return the actual length of the skeleton. + * @param status The error code, set if an error occurred while attempting to + * perform the check. Possible Errors include U_INVALID_CHAR_FOUND + * for invalid UTF-8 sequences, and + * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small + * to hold the complete skeleton. + * @return The length of the skeleton string, in bytes. The returned length + * is always that of the complete skeleton, even when the + * supplied buffer is too small (or of zero length) + * + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_getSkeletonUTF8(const USpoofChecker *sc, + uint32_t type, + const char *id, int32_t length, + char *dest, int32_t destCapacity, + UErrorCode *status); + +/** + * Get the set of Candidate Characters for Inclusion in Identifiers, as defined + * in http://unicode.org/Public/security/latest/xidmodifications.txt + * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @stable ICU 51 + */ +U_CAPI const USet * U_EXPORT2 +uspoof_getInclusionSet(UErrorCode *status); + +/** + * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined + * in http://unicode.org/Public/security/latest/xidmodifications.txt + * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @stable ICU 51 + */ +U_CAPI const USet * U_EXPORT2 +uspoof_getRecommendedSet(UErrorCode *status); + +/** + * Serialize the data for a spoof detector into a chunk of memory. + * The flattened spoof detection tables can later be used to efficiently + * instantiate a new Spoof Detector. + * + * The serialized spoof checker includes only the data compiled from the + * Unicode data tables by uspoof_openFromSource(); it does not include + * include any other state or configuration that may have been set. + * + * @param sc the Spoof Detector whose data is to be serialized. + * @param data a pointer to 32-bit-aligned memory to be filled with the data, + * can be NULL if capacity==0 + * @param capacity the number of bytes available at data, + * or 0 for preflighting + * @param status an in/out ICU UErrorCode; possible errors include: + * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization + * - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad + * @return the number of bytes written or needed for the spoof data + * + * @see utrie2_openFromSerialized() + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_serialize(USpoofChecker *sc, + void *data, int32_t capacity, + UErrorCode *status); + +U_CDECL_END + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalUSpoofCheckerPointer + * "Smart pointer" class, closes a USpoofChecker via uspoof_close(). + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 4.4 + */ +/** + * \cond + * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER. + * For now, suppress with a Doxygen cond + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close); +/** \endcond */ + +/** + * \class LocalUSpoofCheckResultPointer + * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`. + * For most methods see the LocalPointerBase base class. + * + * @see LocalPointerBase + * @see LocalPointer + * @stable ICU 58 + */ + +/** + * \cond + * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER. + * For now, suppress with a Doxygen cond + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult); +/** \endcond */ + +U_NAMESPACE_END + +/** + * Limit the acceptable characters to those specified by a Unicode Set. + * Any previously specified character limit is + * is replaced by the new settings. This includes limits on + * characters that were set with the uspoof_setAllowedLocales() function. + * + * The USPOOF_CHAR_LIMIT test is automatically enabled for this + * USoofChecker by this function. + * + * @param sc The USpoofChecker + * @param chars A Unicode Set containing the list of + * characters that are permitted. Ownership of the set + * remains with the caller. The incoming set is cloned by + * this function, so there are no restrictions on modifying + * or deleting the UnicodeSet after calling this function. + * @param status The error code, set if this function encounters a problem. + * @stable ICU 4.2 + */ +U_CAPI void U_EXPORT2 +uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status); + + +/** + * Get a UnicodeSet for the characters permitted in an identifier. + * This corresponds to the limits imposed by the Set Allowed Characters / + * UnicodeSet functions. Limitations imposed by other checks will not be + * reflected in the set returned by this function. + * + * The returned set will be frozen, meaning that it cannot be modified + * by the caller. + * + * Ownership of the returned set remains with the Spoof Detector. The + * returned set will become invalid if the spoof detector is closed, + * or if a new set of allowed characters is specified. + * + * + * @param sc The USpoofChecker + * @param status The error code, set if this function encounters a problem. + * @return A UnicodeSet containing the characters that are permitted by + * the USPOOF_CHAR_LIMIT test. + * @stable ICU 4.2 + */ +U_CAPI const icu::UnicodeSet * U_EXPORT2 +uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * \note + * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead. + * The newer API exposes additional information from the check procedure + * and is otherwise identical to this method. + * + * @param sc The USpoofChecker + * @param id A identifier to be checked for possible security issues. + * @param position Deprecated in ICU 51. Always returns zero. + * Originally, an out parameter for the index of the first + * string position that failed a check. + * This parameter may be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. + * @see uspoof_check2UnicodeString + * @stable ICU 4.2 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_checkUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id, + int32_t *position, + UErrorCode *status); + +/** + * Check the specified string for possible security issues. + * The text to be checked will typically be an identifier of some sort. + * The set of checks to be performed is specified with uspoof_setChecks(). + * + * @param sc The USpoofChecker + * @param id A identifier to be checked for possible security issues. + * @param checkResult An instance of USpoofCheckResult to be filled with + * details about the identifier. Can be NULL. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Spoofing or security issues detected with the input string are + * not reported here, but through the function's return value. + * @return An integer value with bits set for any potential security + * or spoofing issues detected. The bits are defined by + * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) + * will be zero if the input string passes all of the + * enabled checks. Any information in this bitmask will be + * consistent with the information saved in the optional + * checkResult parameter. + * @see uspoof_openCheckResult + * @see uspoof_check2 + * @see uspoof_check2UTF8 + * @stable ICU 58 + */ +U_CAPI int32_t U_EXPORT2 +uspoof_check2UnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id, + USpoofCheckResult* checkResult, + UErrorCode *status); + +/** + * A version of {@link uspoof_areConfusable} accepting UnicodeStrings. + * + * @param sc The USpoofChecker + * @param s1 The first of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param s2 The second of the two identifiers to be compared for + * confusability. The strings are in UTF-8 format. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * Confusability of the identifiers is not reported here, + * but through this function's return value. + * @return An integer value with bit(s) set corresponding to + * the type of confusability found, as defined by + * enum USpoofChecks. Zero is returned if the identifiers + * are not confusable. + * + * @stable ICU 4.2 + * + * @see uspoof_areConfusable + */ +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &s1, + const icu::UnicodeString &s2, + UErrorCode *status); + +/** + * Get the "skeleton" for an identifier. + * Skeletons are a transformation of the input identifier; + * Two identifiers are confusable if their skeletons are identical. + * See Unicode UAX #39 for additional information. + * + * Using skeletons directly makes it possible to quickly check + * whether an identifier is confusable with any of some large + * set of existing identifiers, by creating an efficiently + * searchable collection of the skeletons. + * + * @param sc The USpoofChecker. + * @param type Deprecated in ICU 58. You may pass any number. + * Originally, controlled which of the Unicode confusable data + * tables to use. + * @param id The input identifier whose skeleton will be computed. + * @param dest The output identifier, to receive the skeleton string. + * @param status The error code, set if an error occurred while attempting to + * perform the check. + * @return A reference to the destination (skeleton) string. + * + * @stable ICU 4.2 + */ +U_I18N_API icu::UnicodeString & U_EXPORT2 +uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, + uint32_t type, + const icu::UnicodeString &id, + icu::UnicodeString &dest, + UErrorCode *status); + +/** + * Get the set of Candidate Characters for Inclusion in Identifiers, as defined + * in http://unicode.org/Public/security/latest/xidmodifications.txt + * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @stable ICU 51 + */ +U_CAPI const icu::UnicodeSet * U_EXPORT2 +uspoof_getInclusionUnicodeSet(UErrorCode *status); + +/** + * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined + * in http://unicode.org/Public/security/latest/xidmodifications.txt + * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. + * + * The returned set is frozen. Ownership of the set remains with the ICU library; it must not + * be deleted by the caller. + * + * @param status The error code, set if a problem occurs while creating the set. + * + * @stable ICU 51 + */ +U_CAPI const icu::UnicodeSet * U_EXPORT2 +uspoof_getRecommendedUnicodeSet(UErrorCode *status); + +#endif /* U_SHOW_CPLUSPLUS_API */ + +#endif /* UCONFIG_NO_NORMALIZATION */ + +#endif /* USPOOF_H */ diff --git a/thirdparty/icu4c/i18n/uspoof.cpp b/thirdparty/icu4c/i18n/uspoof.cpp new file mode 100644 index 0000000000..dd4618baa7 --- /dev/null +++ b/thirdparty/icu4c/i18n/uspoof.cpp @@ -0,0 +1,839 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +*************************************************************************** +* Copyright (C) 2008-2015, International Business Machines Corporation +* and others. All Rights Reserved. +*************************************************************************** +* file name: uspoof.cpp +* encoding: UTF-8 +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2008Feb13 +* created by: Andy Heninger +* +* Unicode Spoof Detection +*/ +#include "unicode/utypes.h" +#include "unicode/normalizer2.h" +#include "unicode/uspoof.h" +#include "unicode/ustring.h" +#include "unicode/utf16.h" +#include "cmemory.h" +#include "cstring.h" +#include "mutex.h" +#include "scriptset.h" +#include "uassert.h" +#include "ucln_in.h" +#include "uspoof_impl.h" +#include "umutex.h" + + +#if !UCONFIG_NO_NORMALIZATION + +U_NAMESPACE_USE + + +// +// Static Objects used by the spoof impl, their thread safe initialization and their cleanup. +// +static UnicodeSet *gInclusionSet = NULL; +static UnicodeSet *gRecommendedSet = NULL; +static const Normalizer2 *gNfdNormalizer = NULL; +static UInitOnce gSpoofInitStaticsOnce = U_INITONCE_INITIALIZER; + +namespace { + +UBool U_CALLCONV +uspoof_cleanup(void) { + delete gInclusionSet; + gInclusionSet = NULL; + delete gRecommendedSet; + gRecommendedSet = NULL; + gNfdNormalizer = NULL; + gSpoofInitStaticsOnce.reset(); + return TRUE; +} + +void U_CALLCONV initializeStatics(UErrorCode &status) { + static const char16_t *inclusionPat = + u"['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C" + u"\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]"; + gInclusionSet = new UnicodeSet(UnicodeString(inclusionPat), status); + if (gInclusionSet == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + gInclusionSet->freeze(); + + // Note: data from IdentifierStatus.txt & IdentifierType.txt + // There is tooling to generate this constant in the unicodetools project: + // org.unicode.text.tools.RecommendedSetGenerator + // It will print the Java and C++ code to the console for easy copy-paste into this file. + static const char16_t *recommendedPat = + u"[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E" + u"\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-" + u"\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E" + u"\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\\u0306-" + u"\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u0328\\u032D\\u032E" + u"\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386" + u"\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-" + u"\\u04FF\\u0510-\\u0529\\u052E\\u052F\\u0531-\\u0556\\u0559\\u0561-\\u0586" + u"\\u05B4\\u05D0-\\u05EA\\u05EF-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-" + u"\\u0669\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06A0\\u06A2-\\u06D3" + u"\\u06D5\\u06E5\\u06E6\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u0870-\\u0887" + u"\\u0889-\\u088E\\u08A0-\\u08AC\\u08B2\\u08B5-\\u08C9\\u0901-\\u094D\\u094F" + u"\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096F\\u0971-\\u0977\\u0979-" + u"\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-" + u"\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CE" + u"\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u09FE\\u0A01-\\u0A03\\u0A05-\\u0A0A" + u"\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A35\\u0A38\\u0A39" + u"\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A5C\\u0A66-\\u0A74" + u"\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0" + u"\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD" + u"\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0AFA-\\u0AFF\\u0B01-\\u0B03\\u0B05-" + u"\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B35-" + u"\\u0B39\\u0B3C-\\u0B43\\u0B47\\u0B48\\u0B4B-\\u0B4D\\u0B55-\\u0B57\\u0B5F-" + u"\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90" + u"\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-" + u"\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD0" + u"\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-" + u"\\u0C33\\u0C35-\\u0C39\\u0C3C-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55" + u"\\u0C56\\u0C5D\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80\\u0C82\\u0C83\\u0C85-" + u"\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBC-" + u"\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6\\u0CDD\\u0CE0-\\u0CE3" + u"\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D00\\u0D02\\u0D03\\u0D05-\\u0D0C\\u0D0E-" + u"\\u0D10\\u0D12-\\u0D3A\\u0D3D-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-" + u"\\u0D57\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-" + u"\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD" + u"\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-" + u"\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59\\u0E81\\u0E82\\u0E84" + u"\\u0E86-\\u0E8A\\u0E8C-\\u0EA3\\u0EA5\\u0EA7-\\u0EB2\\u0EB4-\\u0EBD\\u0EC0-" + u"\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00\\u0F20-" + u"\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C\\u0F4E-" + u"\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C\\u0F71" + u"\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-\\u0F97" + u"\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6\\u0FA8-\\u0FAB\\u0FAD-\\u0FB8" + u"\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10C7\\u10CD\\u10D0-" + u"\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D\\u1250-" + u"\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0" + u"\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-" + u"\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F\\u1780-" + u"\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CD\\u17D0\\u17D2\\u17D7" + u"\\u17DC\\u17E0-\\u17E9\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1E00-\\u1E99\\u1E9E" + u"\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D" + u"\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72\\u1F74\\u1F76" + u"\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC\\u1FC2-\\u1FC4" + u"\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA\\u1FE0-\\u1FE2" + u"\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA\\u1FFC\\u2D27" + u"\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-" + u"\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u3005-" + u"\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-\\u30FA\\u30FC-" + u"\\u30FE\\u3105-\\u312D\\u312F\\u31A0-\\u31BF\\u3400-\\u4DBF\\u4E00-\\u9FFF" + u"\\uA67F\\uA717-\\uA71F\\uA788\\uA78D\\uA792\\uA793\\uA7AA\\uA7AE\\uA7B8" + u"\\uA7B9\\uA7C0-\\uA7CA\\uA7D0\\uA7D1\\uA7D3\\uA7D5-\\uA7D9\\uA9E7-\\uA9FE" + u"\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16" + u"\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAB66\\uAB67\\uAC00-\\uD7A3\\uFA0E\\uFA0F" + u"\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29\\U00011301" + u"\\U00011303\\U0001133B\\U0001133C\\U00016FF0\\U00016FF1\\U0001B11F-" + u"\\U0001B122\\U0001B150-\\U0001B152\\U0001B164-\\U0001B167\\U0001DF00-" + u"\\U0001DF1E\\U0001E7E0-\\U0001E7E6\\U0001E7E8-\\U0001E7EB\\U0001E7ED" + u"\\U0001E7EE\\U0001E7F0-\\U0001E7FE\\U00020000-\\U0002A6DF\\U0002A700-" + u"\\U0002B738\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-" + u"\\U0002EBE0\\U00030000-\\U0003134A]"; + + gRecommendedSet = new UnicodeSet(UnicodeString(recommendedPat), status); + if (gRecommendedSet == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + delete gInclusionSet; + return; + } + gRecommendedSet->freeze(); + gNfdNormalizer = Normalizer2::getNFDInstance(status); + ucln_i18n_registerCleanup(UCLN_I18N_SPOOF, uspoof_cleanup); +} + +} // namespace + +U_CFUNC void uspoof_internalInitStatics(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); +} + +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_open(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + if (U_FAILURE(*status)) { + return NULL; + } + SpoofImpl *si = new SpoofImpl(*status); + if (si == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + if (U_FAILURE(*status)) { + delete si; + return NULL; + } + return si->asUSpoofChecker(); +} + + +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, + UErrorCode *status) { + if (U_FAILURE(*status)) { + return NULL; + } + + if (data == NULL) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + if (U_FAILURE(*status)) + { + return NULL; + } + + SpoofData *sd = new SpoofData(data, length, *status); + if (sd == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + + if (U_FAILURE(*status)) { + delete sd; + return NULL; + } + + SpoofImpl *si = new SpoofImpl(sd, *status); + if (si == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + delete sd; // explicit delete as the destructor for si won't be called. + return NULL; + } + + if (U_FAILURE(*status)) { + delete si; // no delete for sd, as the si destructor will delete it. + return NULL; + } + + if (pActualLength != NULL) { + *pActualLength = sd->size(); + } + return si->asUSpoofChecker(); +} + + +U_CAPI USpoofChecker * U_EXPORT2 +uspoof_clone(const USpoofChecker *sc, UErrorCode *status) { + const SpoofImpl *src = SpoofImpl::validateThis(sc, *status); + if (src == NULL) { + return NULL; + } + SpoofImpl *result = new SpoofImpl(*src, *status); // copy constructor + if (result == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + if (U_FAILURE(*status)) { + delete result; + result = NULL; + } + return result->asUSpoofChecker(); +} + + +U_CAPI void U_EXPORT2 +uspoof_close(USpoofChecker *sc) { + UErrorCode status = U_ZERO_ERROR; + SpoofImpl *This = SpoofImpl::validateThis(sc, status); + delete This; +} + + +U_CAPI void U_EXPORT2 +uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) { + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return; + } + + // Verify that the requested checks are all ones (bits) that + // are acceptable, known values. + if (checks & ~(USPOOF_ALL_CHECKS | USPOOF_AUX_INFO)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + + This->fChecks = checks; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return 0; + } + return This->fChecks; +} + +U_CAPI void U_EXPORT2 +uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel) { + UErrorCode status = U_ZERO_ERROR; + SpoofImpl *This = SpoofImpl::validateThis(sc, status); + if (This != NULL) { + This->fRestrictionLevel = restrictionLevel; + This->fChecks |= USPOOF_RESTRICTION_LEVEL; + } +} + +U_CAPI URestrictionLevel U_EXPORT2 +uspoof_getRestrictionLevel(const USpoofChecker *sc) { + UErrorCode status = U_ZERO_ERROR; + const SpoofImpl *This = SpoofImpl::validateThis(sc, status); + if (This == NULL) { + return USPOOF_UNRESTRICTIVE; + } + return This->fRestrictionLevel; +} + +U_CAPI void U_EXPORT2 +uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status) { + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return; + } + This->setAllowedLocales(localesList, *status); +} + +U_CAPI const char * U_EXPORT2 +uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status) { + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return NULL; + } + return This->getAllowedLocales(*status); +} + + +U_CAPI const USet * U_EXPORT2 +uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) { + const UnicodeSet *result = uspoof_getAllowedUnicodeSet(sc, status); + return result->toUSet(); +} + +U_CAPI const UnicodeSet * U_EXPORT2 +uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return NULL; + } + return This->fAllowedCharsSet; +} + + +U_CAPI void U_EXPORT2 +uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) { + const UnicodeSet *set = UnicodeSet::fromUSet(chars); + uspoof_setAllowedUnicodeSet(sc, set, status); +} + + +U_CAPI void U_EXPORT2 +uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) { + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return; + } + if (chars->isBogus()) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + UnicodeSet *clonedSet = chars->clone(); + if (clonedSet == NULL || clonedSet->isBogus()) { + *status = U_MEMORY_ALLOCATION_ERROR; + return; + } + clonedSet->freeze(); + delete This->fAllowedCharsSet; + This->fAllowedCharsSet = clonedSet; + This->fChecks |= USPOOF_CHAR_LIMIT; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_check(const USpoofChecker *sc, + const UChar *id, int32_t length, + int32_t *position, + UErrorCode *status) { + + // Backwards compatibility: + if (position != NULL) { + *position = 0; + } + + // Delegate to uspoof_check2 + return uspoof_check2(sc, id, length, NULL, status); +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_check2(const USpoofChecker *sc, + const UChar* id, int32_t length, + USpoofCheckResult* checkResult, + UErrorCode *status) { + + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return 0; + } + if (length < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + UnicodeString idStr((length == -1), id, length); // Aliasing constructor. + int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status); + return result; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_checkUTF8(const USpoofChecker *sc, + const char *id, int32_t length, + int32_t *position, + UErrorCode *status) { + + // Backwards compatibility: + if (position != NULL) { + *position = 0; + } + + // Delegate to uspoof_check2 + return uspoof_check2UTF8(sc, id, length, NULL, status); +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_check2UTF8(const USpoofChecker *sc, + const char *id, int32_t length, + USpoofCheckResult* checkResult, + UErrorCode *status) { + + if (U_FAILURE(*status)) { + return 0; + } + UnicodeString idStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id)))); + int32_t result = uspoof_check2UnicodeString(sc, idStr, checkResult, status); + return result; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusable(const USpoofChecker *sc, + const UChar *id1, int32_t length1, + const UChar *id2, int32_t length2, + UErrorCode *status) { + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length1 < -1 || length2 < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UnicodeString id1Str((length1==-1), id1, length1); // Aliasing constructor + UnicodeString id2Str((length2==-1), id2, length2); // Aliasing constructor + return uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUTF8(const USpoofChecker *sc, + const char *id1, int32_t length1, + const char *id2, int32_t length2, + UErrorCode *status) { + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length1 < -1 || length2 < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + UnicodeString id1Str = UnicodeString::fromUTF8(StringPiece(id1, length1>=0? length1 : static_cast<int32_t>(uprv_strlen(id1)))); + UnicodeString id2Str = UnicodeString::fromUTF8(StringPiece(id2, length2>=0? length2 : static_cast<int32_t>(uprv_strlen(id2)))); + int32_t results = uspoof_areConfusableUnicodeString(sc, id1Str, id2Str, status); + return results; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_areConfusableUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id1, + const icu::UnicodeString &id2, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + // + // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable, + // and for definitions of the types (single, whole, mixed-script) of confusables. + + // We only care about a few of the check flags. Ignore the others. + // If no tests relevant to this function have been specified, return an error. + // TODO: is this really the right thing to do? It's probably an error on the caller's part, + // but logically we would just return 0 (no error). + if ((This->fChecks & USPOOF_CONFUSABLE) == 0) { + *status = U_INVALID_STATE_ERROR; + return 0; + } + + // Compute the skeletons and check for confusability. + UnicodeString id1Skeleton; + uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id1, id1Skeleton, status); + UnicodeString id2Skeleton; + uspoof_getSkeletonUnicodeString(sc, 0 /* deprecated */, id2, id2Skeleton, status); + if (U_FAILURE(*status)) { return 0; } + if (id1Skeleton != id2Skeleton) { + return 0; + } + + // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes + // of confusables according to UTS 39 section 4. + // Start by computing the resolved script sets of id1 and id2. + ScriptSet id1RSS; + This->getResolvedScriptSet(id1, id1RSS, *status); + ScriptSet id2RSS; + This->getResolvedScriptSet(id2, id2RSS, *status); + + // Turn on all applicable flags + int32_t result = 0; + if (id1RSS.intersects(id2RSS)) { + result |= USPOOF_SINGLE_SCRIPT_CONFUSABLE; + } else { + result |= USPOOF_MIXED_SCRIPT_CONFUSABLE; + if (!id1RSS.isEmpty() && !id2RSS.isEmpty()) { + result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE; + } + } + + // Turn off flags that the user doesn't want + if ((This->fChecks & USPOOF_SINGLE_SCRIPT_CONFUSABLE) == 0) { + result &= ~USPOOF_SINGLE_SCRIPT_CONFUSABLE; + } + if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) == 0) { + result &= ~USPOOF_MIXED_SCRIPT_CONFUSABLE; + } + if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) == 0) { + result &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE; + } + + return result; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_checkUnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id, + int32_t *position, + UErrorCode *status) { + + // Backwards compatibility: + if (position != NULL) { + *position = 0; + } + + // Delegate to uspoof_check2 + return uspoof_check2UnicodeString(sc, id, NULL, status); +} + +namespace { + +int32_t checkImpl(const SpoofImpl* This, const UnicodeString& id, CheckResult* checkResult, UErrorCode* status) { + U_ASSERT(This != NULL); + U_ASSERT(checkResult != NULL); + checkResult->clear(); + int32_t result = 0; + + if (0 != (This->fChecks & USPOOF_RESTRICTION_LEVEL)) { + URestrictionLevel idRestrictionLevel = This->getRestrictionLevel(id, *status); + if (idRestrictionLevel > This->fRestrictionLevel) { + result |= USPOOF_RESTRICTION_LEVEL; + } + checkResult->fRestrictionLevel = idRestrictionLevel; + } + + if (0 != (This->fChecks & USPOOF_MIXED_NUMBERS)) { + UnicodeSet numerics; + This->getNumerics(id, numerics, *status); + if (numerics.size() > 1) { + result |= USPOOF_MIXED_NUMBERS; + } + checkResult->fNumerics = numerics; // UnicodeSet::operator= + } + + if (0 != (This->fChecks & USPOOF_HIDDEN_OVERLAY)) { + int32_t index = This->findHiddenOverlay(id, *status); + if (index != -1) { + result |= USPOOF_HIDDEN_OVERLAY; + } + } + + + if (0 != (This->fChecks & USPOOF_CHAR_LIMIT)) { + int32_t i; + UChar32 c; + int32_t length = id.length(); + for (i=0; i<length ;) { + c = id.char32At(i); + i += U16_LENGTH(c); + if (!This->fAllowedCharsSet->contains(c)) { + result |= USPOOF_CHAR_LIMIT; + break; + } + } + } + + if (0 != (This->fChecks & USPOOF_INVISIBLE)) { + // This check needs to be done on NFD input + UnicodeString nfdText; + gNfdNormalizer->normalize(id, nfdText, *status); + int32_t nfdLength = nfdText.length(); + + // scan for more than one occurrence of the same non-spacing mark + // in a sequence of non-spacing marks. + int32_t i; + UChar32 c; + UChar32 firstNonspacingMark = 0; + UBool haveMultipleMarks = FALSE; + UnicodeSet marksSeenSoFar; // Set of combining marks in a single combining sequence. + + for (i=0; i<nfdLength ;) { + c = nfdText.char32At(i); + i += U16_LENGTH(c); + if (u_charType(c) != U_NON_SPACING_MARK) { + firstNonspacingMark = 0; + if (haveMultipleMarks) { + marksSeenSoFar.clear(); + haveMultipleMarks = FALSE; + } + continue; + } + if (firstNonspacingMark == 0) { + firstNonspacingMark = c; + continue; + } + if (!haveMultipleMarks) { + marksSeenSoFar.add(firstNonspacingMark); + haveMultipleMarks = TRUE; + } + if (marksSeenSoFar.contains(c)) { + // report the error, and stop scanning. + // No need to find more than the first failure. + result |= USPOOF_INVISIBLE; + break; + } + marksSeenSoFar.add(c); + } + } + + checkResult->fChecks = result; + return checkResult->toCombinedBitmask(This->fChecks); +} + +} // namespace + +U_CAPI int32_t U_EXPORT2 +uspoof_check2UnicodeString(const USpoofChecker *sc, + const icu::UnicodeString &id, + USpoofCheckResult* checkResult, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + return FALSE; + } + + if (checkResult != NULL) { + CheckResult* ThisCheckResult = CheckResult::validateThis(checkResult, *status); + if (ThisCheckResult == NULL) { + return FALSE; + } + return checkImpl(This, id, ThisCheckResult, status); + } else { + // Stack-allocate the checkResult since this method doesn't return it + CheckResult stackCheckResult; + return checkImpl(This, id, &stackCheckResult, status); + } +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_getSkeleton(const USpoofChecker *sc, + uint32_t type, + const UChar *id, int32_t length, + UChar *dest, int32_t destCapacity, + UErrorCode *status) { + + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UnicodeString idStr((length==-1), id, length); // Aliasing constructor + UnicodeString destStr; + uspoof_getSkeletonUnicodeString(sc, type, idStr, destStr, status); + destStr.extract(dest, destCapacity, *status); + return destStr.length(); +} + + + +U_I18N_API UnicodeString & U_EXPORT2 +uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, + uint32_t /*type*/, + const UnicodeString &id, + UnicodeString &dest, + UErrorCode *status) { + const SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return dest; + } + + UnicodeString nfdId; + gNfdNormalizer->normalize(id, nfdId, *status); + + // Apply the skeleton mapping to the NFD normalized input string + // Accumulate the skeleton, possibly unnormalized, in a UnicodeString. + int32_t inputIndex = 0; + UnicodeString skelStr; + int32_t normalizedLen = nfdId.length(); + for (inputIndex=0; inputIndex < normalizedLen; ) { + UChar32 c = nfdId.char32At(inputIndex); + inputIndex += U16_LENGTH(c); + This->fSpoofData->confusableLookup(c, skelStr); + } + + gNfdNormalizer->normalize(skelStr, dest, *status); + return dest; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_getSkeletonUTF8(const USpoofChecker *sc, + uint32_t type, + const char *id, int32_t length, + char *dest, int32_t destCapacity, + UErrorCode *status) { + SpoofImpl::validateThis(sc, *status); + if (U_FAILURE(*status)) { + return 0; + } + if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL)) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + UnicodeString srcStr = UnicodeString::fromUTF8(StringPiece(id, length>=0 ? length : static_cast<int32_t>(uprv_strlen(id)))); + UnicodeString destStr; + uspoof_getSkeletonUnicodeString(sc, type, srcStr, destStr, status); + if (U_FAILURE(*status)) { + return 0; + } + + int32_t lengthInUTF8 = 0; + u_strToUTF8(dest, destCapacity, &lengthInUTF8, + destStr.getBuffer(), destStr.length(), status); + return lengthInUTF8; +} + + +U_CAPI int32_t U_EXPORT2 +uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) { + SpoofImpl *This = SpoofImpl::validateThis(sc, *status); + if (This == NULL) { + U_ASSERT(U_FAILURE(*status)); + return 0; + } + + return This->fSpoofData->serialize(buf, capacity, *status); +} + +U_CAPI const USet * U_EXPORT2 +uspoof_getInclusionSet(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + return gInclusionSet->toUSet(); +} + +U_CAPI const USet * U_EXPORT2 +uspoof_getRecommendedSet(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + return gRecommendedSet->toUSet(); +} + +U_I18N_API const UnicodeSet * U_EXPORT2 +uspoof_getInclusionUnicodeSet(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + return gInclusionSet; +} + +U_I18N_API const UnicodeSet * U_EXPORT2 +uspoof_getRecommendedUnicodeSet(UErrorCode *status) { + umtx_initOnce(gSpoofInitStaticsOnce, &initializeStatics, *status); + return gRecommendedSet; +} + +//------------------ +// CheckResult APIs +//------------------ + +U_CAPI USpoofCheckResult* U_EXPORT2 +uspoof_openCheckResult(UErrorCode *status) { + CheckResult* checkResult = new CheckResult(); + if (checkResult == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + return NULL; + } + return checkResult->asUSpoofCheckResult(); +} + +U_CAPI void U_EXPORT2 +uspoof_closeCheckResult(USpoofCheckResult* checkResult) { + UErrorCode status = U_ZERO_ERROR; + CheckResult* This = CheckResult::validateThis(checkResult, status); + delete This; +} + +U_CAPI int32_t U_EXPORT2 +uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status) { + const CheckResult* This = CheckResult::validateThis(checkResult, *status); + if (U_FAILURE(*status)) { return 0; } + return This->fChecks; +} + +U_CAPI URestrictionLevel U_EXPORT2 +uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status) { + const CheckResult* This = CheckResult::validateThis(checkResult, *status); + if (U_FAILURE(*status)) { return USPOOF_UNRESTRICTIVE; } + return This->fRestrictionLevel; +} + +U_CAPI const USet* U_EXPORT2 +uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status) { + const CheckResult* This = CheckResult::validateThis(checkResult, *status); + if (U_FAILURE(*status)) { return NULL; } + return This->fNumerics.toUSet(); +} + + + +#endif // !UCONFIG_NO_NORMALIZATION diff --git a/thirdparty/icu4c/i18n/uspoof_impl.cpp b/thirdparty/icu4c/i18n/uspoof_impl.cpp new file mode 100644 index 0000000000..b283d81321 --- /dev/null +++ b/thirdparty/icu4c/i18n/uspoof_impl.cpp @@ -0,0 +1,959 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +********************************************************************** +* Copyright (C) 2008-2016, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +*/ + +#include "unicode/utypes.h" +#include "unicode/uspoof.h" +#include "unicode/uchar.h" +#include "unicode/uniset.h" +#include "unicode/utf16.h" +#include "utrie2.h" +#include "cmemory.h" +#include "cstring.h" +#include "scriptset.h" +#include "umutex.h" +#include "udataswp.h" +#include "uassert.h" +#include "ucln_in.h" +#include "uspoof_impl.h" + +#if !UCONFIG_NO_NORMALIZATION + + +U_NAMESPACE_BEGIN + +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) + +SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { + construct(status); + fSpoofData = data; +} + +SpoofImpl::SpoofImpl(UErrorCode& status) { + construct(status); + + // TODO: Call this method where it is actually needed, instead of in the + // constructor, to allow for lazy data loading. See #12696. + fSpoofData = SpoofData::getDefault(status); +} + +SpoofImpl::SpoofImpl() { + UErrorCode status = U_ZERO_ERROR; + construct(status); + + // TODO: Call this method where it is actually needed, instead of in the + // constructor, to allow for lazy data loading. See #12696. + fSpoofData = SpoofData::getDefault(status); +} + +void SpoofImpl::construct(UErrorCode& status) { + fChecks = USPOOF_ALL_CHECKS; + fSpoofData = NULL; + fAllowedCharsSet = NULL; + fAllowedLocales = NULL; + fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; + + if (U_FAILURE(status)) { return; } + + UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); + fAllowedCharsSet = allowedCharsSet; + fAllowedLocales = uprv_strdup(""); + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + allowedCharsSet->freeze(); +} + + +// Copy Constructor, used by the user level clone() function. +SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : + fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , + fAllowedLocales(NULL) { + if (U_FAILURE(status)) { + return; + } + fChecks = src.fChecks; + if (src.fSpoofData != NULL) { + fSpoofData = src.fSpoofData->addReference(); + } + fAllowedCharsSet = src.fAllowedCharsSet->clone(); + fAllowedLocales = uprv_strdup(src.fAllowedLocales); + if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + } + fRestrictionLevel = src.fRestrictionLevel; +} + +SpoofImpl::~SpoofImpl() { + if (fSpoofData != NULL) { + fSpoofData->removeReference(); // Will delete if refCount goes to zero. + } + delete fAllowedCharsSet; + uprv_free((void *)fAllowedLocales); +} + +// Cast this instance as a USpoofChecker for the C API. +USpoofChecker *SpoofImpl::asUSpoofChecker() { + return exportForC(); +} + +// +// Incoming parameter check on Status and the SpoofChecker object +// received from the C API. +// +const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { + auto* This = validate(sc, status); + if (U_FAILURE(status)) { + return NULL; + } + if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) { + return NULL; + } + return This; +} + +SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { + return const_cast<SpoofImpl *> + (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); +} + + +void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { + UnicodeSet allowedChars; + UnicodeSet *tmpSet = NULL; + const char *locStart = localesList; + const char *locEnd = NULL; + const char *localesListEnd = localesList + uprv_strlen(localesList); + int32_t localeListCount = 0; // Number of locales provided by caller. + + // Loop runs once per locale from the localesList, a comma separated list of locales. + do { + locEnd = uprv_strchr(locStart, ','); + if (locEnd == NULL) { + locEnd = localesListEnd; + } + while (*locStart == ' ') { + locStart++; + } + const char *trimmedEnd = locEnd-1; + while (trimmedEnd > locStart && *trimmedEnd == ' ') { + trimmedEnd--; + } + if (trimmedEnd <= locStart) { + break; + } + const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); + localeListCount++; + + // We have one locale from the locales list. + // Add the script chars for this locale to the accumulating set of allowed chars. + // If the locale is no good, we will be notified back via status. + addScriptChars(locale, &allowedChars, status); + uprv_free((void *)locale); + if (U_FAILURE(status)) { + break; + } + locStart = locEnd + 1; + } while (locStart < localesListEnd); + + // If our caller provided an empty list of locales, we disable the allowed characters checking + if (localeListCount == 0) { + uprv_free((void *)fAllowedLocales); + fAllowedLocales = uprv_strdup(""); + tmpSet = new UnicodeSet(0, 0x10ffff); + if (fAllowedLocales == NULL || tmpSet == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + tmpSet->freeze(); + delete fAllowedCharsSet; + fAllowedCharsSet = tmpSet; + fChecks &= ~USPOOF_CHAR_LIMIT; + return; + } + + + // Add all common and inherited characters to the set of allowed chars. + UnicodeSet tempSet; + tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); + allowedChars.addAll(tempSet); + tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); + allowedChars.addAll(tempSet); + + // If anything went wrong, we bail out without changing + // the state of the spoof checker. + if (U_FAILURE(status)) { + return; + } + + // Store the updated spoof checker state. + tmpSet = allowedChars.clone(); + const char *tmpLocalesList = uprv_strdup(localesList); + if (tmpSet == NULL || tmpLocalesList == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_free((void *)fAllowedLocales); + fAllowedLocales = tmpLocalesList; + tmpSet->freeze(); + delete fAllowedCharsSet; + fAllowedCharsSet = tmpSet; + fChecks |= USPOOF_CHAR_LIMIT; +} + + +const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { + return fAllowedLocales; +} + + +// Given a locale (a language), add all the characters from all of the scripts used with that language +// to the allowedChars UnicodeSet + +void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { + UScriptCode scripts[30]; + + int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); + if (U_FAILURE(status)) { + return; + } + if (status == U_USING_DEFAULT_WARNING) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + UnicodeSet tmpSet; + int32_t i; + for (i=0; i<numScripts; i++) { + tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); + allowedChars->addAll(tmpSet); + } +} + +// Computes the augmented script set for a code point, according to UTS 39 section 5.1. +void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { + result.resetAll(); + result.setScriptExtensions(codePoint, status); + if (U_FAILURE(status)) { return; } + + // Section 5.1 step 1 + if (result.test(USCRIPT_HAN, status)) { + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); + result.set(USCRIPT_JAPANESE, status); + result.set(USCRIPT_KOREAN, status); + } + if (result.test(USCRIPT_HIRAGANA, status)) { + result.set(USCRIPT_JAPANESE, status); + } + if (result.test(USCRIPT_KATAKANA, status)) { + result.set(USCRIPT_JAPANESE, status); + } + if (result.test(USCRIPT_HANGUL, status)) { + result.set(USCRIPT_KOREAN, status); + } + if (result.test(USCRIPT_BOPOMOFO, status)) { + result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); + } + + // Section 5.1 step 2 + if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { + result.setAll(); + } +} + +// Computes the resolved script set for a string, according to UTS 39 section 5.1. +void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { + getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status); +} + +// Computes the resolved script set for a string, omitting characters having the specified script. +// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. +void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { + result.setAll(); + + ScriptSet temp; + UChar32 codePoint; + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { + codePoint = input.char32At(i); + + // Compute the augmented script set for the character + getAugmentedScriptSet(codePoint, temp, status); + if (U_FAILURE(status)) { return; } + + // Intersect the augmented script set with the resolved script set, but only if the character doesn't + // have the script specified in the function call + if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { + result.intersect(temp); + } + } +} + +// Computes the set of numerics for a string, according to UTS 39 section 5.3. +void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { + result.clear(); + + UChar32 codePoint; + for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { + codePoint = input.char32At(i); + + // Store a representative character for each kind of decimal digit + if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { + // Store the zero character as a representative for comparison. + // Unicode guarantees it is codePoint - value + result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); + } + } +} + +// Computes the restriction level of a string, according to UTS 39 section 5.2. +URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { + // Section 5.2 step 1: + if (!fAllowedCharsSet->containsAll(input)) { + return USPOOF_UNRESTRICTIVE; + } + + // Section 5.2 step 2 + // Java use a static UnicodeSet for this test. In C++, avoid the static variable + // and just do a simple for loop. + UBool allASCII = TRUE; + for (int32_t i=0, length=input.length(); i<length; i++) { + if (input.charAt(i) > 0x7f) { + allASCII = FALSE; + break; + } + } + if (allASCII) { + return USPOOF_ASCII; + } + + // Section 5.2 steps 3: + ScriptSet resolvedScriptSet; + getResolvedScriptSet(input, resolvedScriptSet, status); + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } + + // Section 5.2 step 4: + if (!resolvedScriptSet.isEmpty()) { + return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; + } + + // Section 5.2 step 5: + ScriptSet resolvedNoLatn; + getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); + if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } + + // Section 5.2 step 6: + if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) + || resolvedNoLatn.test(USCRIPT_JAPANESE, status) + || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { + return USPOOF_HIGHLY_RESTRICTIVE; + } + + // Section 5.2 step 7: + if (!resolvedNoLatn.isEmpty() + && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) + && !resolvedNoLatn.test(USCRIPT_GREEK, status) + && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { + return USPOOF_MODERATELY_RESTRICTIVE; + } + + // Section 5.2 step 8: + return USPOOF_MINIMALLY_RESTRICTIVE; +} + +int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const { + bool sawLeadCharacter = false; + for (int32_t i=0; i<input.length();) { + UChar32 cp = input.char32At(i); + if (sawLeadCharacter && cp == 0x0307) { + return i; + } + uint8_t combiningClass = u_getCombiningClass(cp); + // Skip over characters except for those with combining class 0 (non-combining characters) or with + // combining class 230 (same class as U+0307) + U_ASSERT(u_getCombiningClass(0x0307) == 230); + if (combiningClass == 0 || combiningClass == 230) { + sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp); + } + i += U16_LENGTH(cp); + } + return -1; +} + +static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) { + return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' || + u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED); +} + +bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const { + if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { + return true; + } + UnicodeString skelStr; + fSpoofData->confusableLookup(cp, skelStr); + UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1)); + if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { + return true; + } + return false; +} + + + +// Convert a text format hex number. Utility function used by builder code. Static. +// Input: UChar *string text. Output: a UChar32 +// Input has been pre-checked, and will have no non-hex chars. +// The number must fall in the code point range of 0..0x10ffff +// Static Function. +UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { + if (U_FAILURE(status)) { + return 0; + } + U_ASSERT(limit-start > 0); + uint32_t val = 0; + int i; + for (i=start; i<limit; i++) { + int digitVal = s[i] - 0x30; + if (digitVal>9) { + digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' + } + if (digitVal>15) { + digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' + } + U_ASSERT(digitVal <= 0xf); + val <<= 4; + val += digitVal; + } + if (val > 0x10ffff) { + status = U_PARSE_ERROR; + val = 0; + } + return (UChar32)val; +} + + +//----------------------------------------- +// +// class CheckResult Implementation +// +//----------------------------------------- + +CheckResult::CheckResult() { + clear(); +} + +USpoofCheckResult* CheckResult::asUSpoofCheckResult() { + return exportForC(); +} + +// +// Incoming parameter check on Status and the CheckResult object +// received from the C API. +// +const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { + return validate(ptr, status); +} + +CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { + return validate(ptr, status); +} + +void CheckResult::clear() { + fChecks = 0; + fNumerics.clear(); + fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; +} + +int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { + if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) { + return fChecks | fRestrictionLevel; + } else { + return fChecks; + } +} + +CheckResult::~CheckResult() { +} + +//---------------------------------------------------------------------------------------------- +// +// class SpoofData Implementation +// +//---------------------------------------------------------------------------------------------- + + +UBool SpoofData::validateDataVersion(UErrorCode &status) const { + if (U_FAILURE(status) || + fRawData == NULL || + fRawData->fMagic != USPOOF_MAGIC || + fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || + fRawData->fFormatVersion[1] != 0 || + fRawData->fFormatVersion[2] != 0 || + fRawData->fFormatVersion[3] != 0) { + status = U_INVALID_FORMAT_ERROR; + return FALSE; + } + return TRUE; +} + +static UBool U_CALLCONV +spoofDataIsAcceptable(void *context, + const char * /* type */, const char * /*name*/, + const UDataInfo *pInfo) { + if( + pInfo->size >= 20 && + pInfo->isBigEndian == U_IS_BIG_ENDIAN && + pInfo->charsetFamily == U_CHARSET_FAMILY && + pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " + pInfo->dataFormat[1] == 0x66 && + pInfo->dataFormat[2] == 0x75 && + pInfo->dataFormat[3] == 0x20 && + pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION + ) { + UVersionInfo *version = static_cast<UVersionInfo *>(context); + if(version != NULL) { + uprv_memcpy(version, pInfo->dataVersion, 4); + } + return TRUE; + } else { + return FALSE; + } +} + +// Methods for the loading of the default confusables data file. The confusable +// data is loaded only when it is needed. +// +// SpoofData::getDefault() - Return the default confusables data, and call the +// initOnce() if it is not available. Adds a reference +// to the SpoofData that the caller is responsible for +// decrementing when they are done with the data. +// +// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData +// is shared by all spoof checkers using the default data. +// +// uspoof_cleanupDefaultData - Called during cleanup. +// + +static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER; +static SpoofData* gDefaultSpoofData; + +static UBool U_CALLCONV +uspoof_cleanupDefaultData(void) { + if (gDefaultSpoofData) { + // Will delete, assuming all user-level spoof checkers were closed. + gDefaultSpoofData->removeReference(); + gDefaultSpoofData = nullptr; + gSpoofInitDefaultOnce.reset(); + } + return TRUE; +} + +static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { + UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables", + spoofDataIsAcceptable, + nullptr, // context, would receive dataVersion if supplied. + &status); + if (U_FAILURE(status)) { return; } + gDefaultSpoofData = new SpoofData(udm, status); + if (U_FAILURE(status)) { + delete gDefaultSpoofData; + gDefaultSpoofData = nullptr; + return; + } + if (gDefaultSpoofData == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); +} + +SpoofData* SpoofData::getDefault(UErrorCode& status) { + umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); + if (U_FAILURE(status)) { return NULL; } + gDefaultSpoofData->addReference(); + return gDefaultSpoofData; +} + + + +SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) +{ + reset(); + if (U_FAILURE(status)) { + return; + } + fUDM = udm; + // fRawData is non-const because it may be constructed by the data builder. + fRawData = reinterpret_cast<SpoofDataHeader *>( + const_cast<void *>(udata_getMemory(udm))); + validateDataVersion(status); + initPtrs(status); +} + + +SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) +{ + reset(); + if (U_FAILURE(status)) { + return; + } + if ((size_t)length < sizeof(SpoofDataHeader)) { + status = U_INVALID_FORMAT_ERROR; + return; + } + if (data == NULL) { + status = U_ILLEGAL_ARGUMENT_ERROR; + return; + } + void *ncData = const_cast<void *>(data); + fRawData = static_cast<SpoofDataHeader *>(ncData); + if (length < fRawData->fLength) { + status = U_INVALID_FORMAT_ERROR; + return; + } + validateDataVersion(status); + initPtrs(status); +} + + +// Spoof Data constructor for use from data builder. +// Initializes a new, empty data area that will be populated later. +SpoofData::SpoofData(UErrorCode &status) { + reset(); + if (U_FAILURE(status)) { + return; + } + fDataOwned = true; + + // The spoof header should already be sized to be a multiple of 16 bytes. + // Just in case it's not, round it up. + uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; + U_ASSERT(initialSize == sizeof(SpoofDataHeader)); + + fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); + fMemLimit = initialSize; + if (fRawData == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(fRawData, 0, initialSize); + + fRawData->fMagic = USPOOF_MAGIC; + fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; + fRawData->fFormatVersion[1] = 0; + fRawData->fFormatVersion[2] = 0; + fRawData->fFormatVersion[3] = 0; + initPtrs(status); +} + +// reset() - initialize all fields. +// Should be updated if any new fields are added. +// Called by constructors to put things in a known initial state. +void SpoofData::reset() { + fRawData = NULL; + fDataOwned = FALSE; + fUDM = NULL; + fMemLimit = 0; + fRefCount = 1; + fCFUKeys = NULL; + fCFUValues = NULL; + fCFUStrings = NULL; +} + + +// SpoofData::initPtrs() +// Initialize the pointers to the various sections of the raw data. +// +// This function is used both during the Trie building process (multiple +// times, as the individual data sections are added), and +// during the opening of a Spoof Checker from prebuilt data. +// +// The pointers for non-existent data sections (identified by an offset of 0) +// are set to NULL. +// +// Note: During building the data, adding each new data section +// reallocs the raw data area, which likely relocates it, which +// in turn requires reinitializing all of the pointers into it, hence +// multiple calls to this function during building. +// +void SpoofData::initPtrs(UErrorCode &status) { + fCFUKeys = NULL; + fCFUValues = NULL; + fCFUStrings = NULL; + if (U_FAILURE(status)) { + return; + } + if (fRawData->fCFUKeys != 0) { + fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); + } + if (fRawData->fCFUStringIndex != 0) { + fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); + } + if (fRawData->fCFUStringTable != 0) { + fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); + } +} + + +SpoofData::~SpoofData() { + if (fDataOwned) { + uprv_free(fRawData); + } + fRawData = NULL; + if (fUDM != NULL) { + udata_close(fUDM); + } + fUDM = NULL; +} + + +void SpoofData::removeReference() { + if (umtx_atomic_dec(&fRefCount) == 0) { + delete this; + } +} + + +SpoofData *SpoofData::addReference() { + umtx_atomic_inc(&fRefCount); + return this; +} + + +void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { + if (U_FAILURE(status)) { + return NULL; + } + if (!fDataOwned) { + UPRV_UNREACHABLE_EXIT; + } + + numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 + uint32_t returnOffset = fMemLimit; + fMemLimit += numBytes; + fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); + fRawData->fLength = fMemLimit; + uprv_memset((char *)fRawData + returnOffset, 0, numBytes); + initPtrs(status); + return (char *)fRawData + returnOffset; +} + +int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { + int32_t dataSize = fRawData->fLength; + if (capacity < dataSize) { + status = U_BUFFER_OVERFLOW_ERROR; + return dataSize; + } + uprv_memcpy(buf, fRawData, dataSize); + return dataSize; +} + +int32_t SpoofData::size() const { + return fRawData->fLength; +} + +//------------------------------- +// +// Front-end APIs for SpoofData +// +//------------------------------- + +int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { + // Perform a binary search. + // [lo, hi), i.e lo is inclusive, hi is exclusive. + // The result after the loop will be in lo. + int32_t lo = 0; + int32_t hi = length(); + do { + int32_t mid = (lo + hi) / 2; + if (codePointAt(mid) > inChar) { + hi = mid; + } else if (codePointAt(mid) < inChar) { + lo = mid; + } else { + // Found result. Break early. + lo = mid; + break; + } + } while (hi - lo > 1); + + // Did we find an entry? If not, the char maps to itself. + if (codePointAt(lo) != inChar) { + dest.append(inChar); + return 1; + } + + // Add the element to the string builder and return. + return appendValueTo(lo, dest); +} + +int32_t SpoofData::length() const { + return fRawData->fCFUKeysSize; +} + +UChar32 SpoofData::codePointAt(int32_t index) const { + return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]); +} + +int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { + int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); + + // Value is either a char (for strings of length 1) or + // an index into the string table (for longer strings) + uint16_t value = fCFUValues[index]; + if (stringLength == 1) { + dest.append((UChar)value); + } else { + dest.append(fCFUStrings + value, stringLength); + } + + return stringLength; +} + + +U_NAMESPACE_END + +U_NAMESPACE_USE + +//----------------------------------------------------------------------------- +// +// uspoof_swap - byte swap and char encoding swap of spoof data +// +//----------------------------------------------------------------------------- +U_CAPI int32_t U_EXPORT2 +uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, + UErrorCode *status) { + + if (status == NULL || U_FAILURE(*status)) { + return 0; + } + if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { + *status=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + // + // Check that the data header is for spoof data. + // (Header contents are defined in gencfu.cpp) + // + const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); + if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ + pInfo->dataFormat[1]==0x66 && + pInfo->dataFormat[2]==0x75 && + pInfo->dataFormat[3]==0x20 && + pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && + pInfo->formatVersion[1]==0 && + pInfo->formatVersion[2]==0 && + pInfo->formatVersion[3]==0 )) { + udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " + "(format version %02x %02x %02x %02x) is not recognized\n", + pInfo->dataFormat[0], pInfo->dataFormat[1], + pInfo->dataFormat[2], pInfo->dataFormat[3], + pInfo->formatVersion[0], pInfo->formatVersion[1], + pInfo->formatVersion[2], pInfo->formatVersion[3]); + *status=U_UNSUPPORTED_ERROR; + return 0; + } + + // + // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific + // header). This swap also conveniently gets us + // the size of the ICU d.h., which lets us locate the start + // of the uspoof specific data. + // + int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); + + + // + // Get the Spoof Data Header, and check that it appears to be OK. + // + // + const uint8_t *inBytes =(const uint8_t *)inData+headerSize; + SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; + if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || + ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) + { + udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); + *status=U_UNSUPPORTED_ERROR; + return 0; + } + + // + // Prefight operation? Just return the size + // + int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); + int32_t totalSize = headerSize + spoofDataLength; + if (length < 0) { + return totalSize; + } + + // + // Check that length passed in is consistent with length from Spoof data header. + // + if (length < totalSize) { + udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", + spoofDataLength); + *status=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + + // + // Swap the Data. Do the data itself first, then the Spoof Data Header, because + // we need to reference the header to locate the data, and an + // inplace swap of the header leaves it unusable. + // + uint8_t *outBytes = (uint8_t *)outData + headerSize; + SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; + + int32_t sectionStart; + int32_t sectionLength; + + // + // If not swapping in place, zero out the output buffer before starting. + // Gaps may exist between the individual sections, and these must be zeroed in + // the output buffer. The simplest way to do that is to just zero the whole thing. + // + if (inBytes != outBytes) { + uprv_memset(outBytes, 0, spoofDataLength); + } + + // Confusables Keys Section (fCFUKeys) + sectionStart = ds->readUInt32(spoofDH->fCFUKeys); + sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; + ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); + + // String Index Section + sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); + sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; + ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); + + // String Table Section + sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); + sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; + ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); + + // And, last, swap the header itself. + // int32_t fMagic // swap this + // uint8_t fFormatVersion[4] // Do not swap this, just copy + // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. + // + uint32_t magic = ds->readUInt32(spoofDH->fMagic); + ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); + + if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { + uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); + } + // swap starting at fLength + ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); + + return totalSize; +} + +#endif + + diff --git a/thirdparty/icu4c/i18n/uspoof_impl.h b/thirdparty/icu4c/i18n/uspoof_impl.h new file mode 100644 index 0000000000..e75ae262bd --- /dev/null +++ b/thirdparty/icu4c/i18n/uspoof_impl.h @@ -0,0 +1,343 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html +/* +*************************************************************************** +* Copyright (C) 2008-2013, International Business Machines Corporation +* and others. All Rights Reserved. +*************************************************************************** +* +* uspoof_impl.h +* +* Implementation header for spoof detection +* +*/ + +#ifndef USPOOFIM_H +#define USPOOFIM_H + +#include "uassert.h" +#include "unicode/utypes.h" +#include "unicode/uspoof.h" +#include "unicode/uscript.h" +#include "unicode/udata.h" +#include "udataswp.h" +#include "utrie2.h" + +#if !UCONFIG_NO_NORMALIZATION + +#ifdef __cplusplus + +#include "capi_helper.h" + +U_NAMESPACE_BEGIN + +// The maximum length (in UTF-16 UChars) of the skeleton replacement string resulting from +// a single input code point. This is function of the unicode.org data. +#define USPOOF_MAX_SKELETON_EXPANSION 20 + +// The default stack buffer size for copies or conversions or normalizations +// of input strings being checked. (Used in multiple places.) +#define USPOOF_STACK_BUFFER_SIZE 100 + +// Magic number for sanity checking spoof data. +#define USPOOF_MAGIC 0x3845fdef + +// Magic number for sanity checking spoof checkers. +#define USPOOF_CHECK_MAGIC 0x2734ecde + +class ScriptSet; +class SpoofData; +struct SpoofDataHeader; +class ConfusableDataUtils; + +/** + * Class SpoofImpl corresponds directly to the plain C API opaque type + * USpoofChecker. One can be cast to the other. + */ +class SpoofImpl : public UObject, + public IcuCApiHelper<USpoofChecker, SpoofImpl, USPOOF_MAGIC> { +public: + SpoofImpl(SpoofData *data, UErrorCode& status); + SpoofImpl(UErrorCode& status); + SpoofImpl(); + void construct(UErrorCode& status); + virtual ~SpoofImpl(); + + /** Copy constructor, used by the user level uspoof_clone() function. + */ + SpoofImpl(const SpoofImpl &src, UErrorCode &status); + + USpoofChecker *asUSpoofChecker(); + static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status); + static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status); + + /** Set and Get AllowedLocales, implementations of the corresponding API */ + void setAllowedLocales(const char *localesList, UErrorCode &status); + const char * getAllowedLocales(UErrorCode &status); + + // Add (union) to the UnicodeSet all of the characters for the scripts used for + // the specified locale. Part of the implementation of setAllowedLocales. + void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status); + + // Functions implementing the features of UTS 39 section 5. + static void getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status); + void getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const; + void getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const; + void getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& status) const; + URestrictionLevel getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const; + + int32_t findHiddenOverlay(const UnicodeString& input, UErrorCode& status) const; + bool isIllegalCombiningDotLeadCharacter(UChar32 cp) const; + + /** parse a hex number. Untility used by the builders. */ + static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status); + + static UClassID U_EXPORT2 getStaticClassID(void); + virtual UClassID getDynamicClassID(void) const override; + + // + // Data Members + // + + int32_t fChecks; // Bit vector of checks to perform. + + SpoofData *fSpoofData; + + const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters. + // for this Spoof Checker. Defaults to all chars. + + const char *fAllowedLocales; // The list of allowed locales. + URestrictionLevel fRestrictionLevel; // The maximum restriction level for an acceptable identifier. +}; + +/** + * Class CheckResult corresponds directly to the plain C API opaque type + * USpoofCheckResult. One can be cast to the other. + */ +class CheckResult : public UObject, + public IcuCApiHelper<USpoofCheckResult, CheckResult, USPOOF_CHECK_MAGIC> { +public: + CheckResult(); + virtual ~CheckResult(); + + USpoofCheckResult *asUSpoofCheckResult(); + static CheckResult *validateThis(USpoofCheckResult *ptr, UErrorCode &status); + static const CheckResult *validateThis(const USpoofCheckResult *ptr, UErrorCode &status); + + void clear(); + + // Used to convert this CheckResult to the older int32_t return value API + int32_t toCombinedBitmask(int32_t expectedChecks); + + // Data Members + int32_t fChecks; // Bit vector of checks that were failed. + UnicodeSet fNumerics; // Set of numerics found in the string. + URestrictionLevel fRestrictionLevel; // The restriction level of the string. +}; + + +// +// Confusable Mappings Data Structures, version 2.0 +// +// For the confusable data, we are essentially implementing a map, +// key: a code point +// value: a string. Most commonly one char in length, but can be more. +// +// The keys are stored as a sorted array of 32 bit ints. +// bits 0-23 a code point value +// bits 24-31 length of value string, in UChars (between 1 and 256 UChars). +// The key table is sorted in ascending code point order. (not on the +// 32 bit int value, the flag bits do not participate in the sorting.) +// +// Lookup is done by means of a binary search in the key table. +// +// The corresponding values are kept in a parallel array of 16 bit ints. +// If the value string is of length 1, it is literally in the value array. +// For longer strings, the value array contains an index into the strings table. +// +// String Table: +// The strings table contains all of the value strings (those of length two or greater) +// concatenated together into one long UChar (UTF-16) array. +// +// There is no nul character or other mark between adjacent strings. +// +//---------------------------------------------------------------------------- +// +// Changes from format version 1 to format version 2: +// 1) Removal of the whole-script confusable data tables. +// 2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask. +// 3) Expansion of string length value in the key bitmask from 2 bits to 8 bits. +// 4) Removal of the string lengths table since 8 bits is sufficient for the +// lengths of all entries in confusables.txt. + + + +// Internal functions for manipulating confusable data table keys +#define USPOOF_CONFUSABLE_DATA_FORMAT_VERSION 2 // version for ICU 58 +class ConfusableDataUtils { +public: + inline static UChar32 keyToCodePoint(int32_t key) { + return key & 0x00ffffff; + } + inline static int32_t keyToLength(int32_t key) { + return ((key & 0xff000000) >> 24) + 1; + } + inline static int32_t codePointAndLengthToKey(UChar32 codePoint, int32_t length) { + U_ASSERT((codePoint & 0x00ffffff) == codePoint); + U_ASSERT(length <= 256); + return codePoint | ((length - 1) << 24); + } +}; + + +//------------------------------------------------------------------------------------- +// +// SpoofData +// +// A small class that wraps the raw (usually memory mapped) spoof data. +// Serves two primary functions: +// 1. Convenience. Contains real pointers to the data, to avoid dealing with +// the offsets in the raw data. +// 2. Reference counting. When a spoof checker is cloned, the raw data is shared +// and must be retained until all checkers using the data are closed. +// Nothing in this struct includes state that is specific to any particular +// USpoofDetector object. +// +//--------------------------------------------------------------------------------------- +class SpoofData: public UMemory { + public: + static SpoofData* getDefault(UErrorCode &status); // Get standard ICU spoof data. + static void releaseDefault(); // Cleanup reference to default spoof data. + + SpoofData(UErrorCode &status); // Create new spoof data wrapper. + // Only used when building new data from rules. + + // Constructor for use when creating from prebuilt default data. + // A UDataMemory is what the ICU internal data loading functions provide. + // The udm is adopted by the SpoofData. + SpoofData(UDataMemory *udm, UErrorCode &status); + + // Constructor for use when creating from serialized data. + // + SpoofData(const void *serializedData, int32_t length, UErrorCode &status); + + // Check raw Spoof Data Version compatibility. + // Return true it looks good. + UBool validateDataVersion(UErrorCode &status) const; + + ~SpoofData(); // Destructor not normally used. + // Use removeReference() instead. + // Reference Counting functions. + // Clone of a user-level spoof detector increments the ref count on the data. + // Close of a user-level spoof detector decrements the ref count. + // If the data is owned by us, it will be deleted when count goes to zero. + SpoofData *addReference(); + void removeReference(); + + // Reset all fields to an initial state. + // Called from the top of all constructors. + void reset(); + + // Copy this instance's raw data buffer to the specified address. + int32_t serialize(void *buf, int32_t capacity, UErrorCode &status) const; + + // Get the total number of bytes of data backed by this SpoofData. + // Not to be confused with length, which returns the number of confusable entries. + int32_t size() const; + + // Get the confusable skeleton transform for a single code point. + // The result is a string with a length between 1 and 18 as of Unicode 9. + // This is the main public endpoint for this class. + // @return The length in UTF-16 code units of the substitution string. + int32_t confusableLookup(UChar32 inChar, UnicodeString &dest) const; + + // Get the number of confusable entries in this SpoofData. + int32_t length() const; + + // Get the code point (key) at the specified index. + UChar32 codePointAt(int32_t index) const; + + // Get the confusable skeleton (value) at the specified index. + // Append it to the specified UnicodeString&. + // @return The length in UTF-16 code units of the skeleton string. + int32_t appendValueTo(int32_t index, UnicodeString& dest) const; + + private: + // Reserve space in the raw data. For use by builder when putting together a + // new set of data. Init the new storage to zero, to prevent inconsistent + // results if it is not all otherwise set by the requester. + // Return: + // pointer to the new space that was added by this function. + void *reserveSpace(int32_t numBytes, UErrorCode &status); + + // initialize the pointers from this object to the raw data. + void initPtrs(UErrorCode &status); + + SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data + UBool fDataOwned; // True if the raw data is owned, and needs + // to be deleted when refcount goes to zero. + UDataMemory *fUDM; // If not NULL, our data came from a + // UDataMemory, which we must close when + // we are done. + + uint32_t fMemLimit; // Limit of available raw data space + u_atomic_int32_t fRefCount; + + // Confusable data + int32_t *fCFUKeys; + uint16_t *fCFUValues; + UChar *fCFUStrings; + + friend class ConfusabledataBuilder; +}; + +//--------------------------------------------------------------------------------------- +// +// Raw Binary Data Formats, as loaded from the ICU data file, +// or as built by the builder. +// +//--------------------------------------------------------------------------------------- +struct SpoofDataHeader { + int32_t fMagic; // (0x3845fdef) + uint8_t fFormatVersion[4]; // Data Format. Same as the value in struct UDataInfo + // if there is one associated with this data. + int32_t fLength; // Total length in bytes of this spoof data, + // including all sections, not just the header. + + // The following four sections refer to data representing the confusable data + // from the Unicode.org data from "confusables.txt" + + int32_t fCFUKeys; // byte offset to Keys table (from SpoofDataHeader *) + int32_t fCFUKeysSize; // number of entries in keys table (32 bits each) + + // TODO: change name to fCFUValues, for consistency. + int32_t fCFUStringIndex; // byte offset to String Indexes table + int32_t fCFUStringIndexSize; // number of entries in String Indexes table (16 bits each) + // (number of entries must be same as in Keys table + + int32_t fCFUStringTable; // byte offset of String table + int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars) + + // The following sections are for data from xidmodifications.txt + + int32_t unused[15]; // Padding, Room for Expansion +}; + + + +U_NAMESPACE_END +#endif /* __cplusplus */ + +/** + * Endianness swap function for binary spoof data. + * @internal + */ +U_CAPI int32_t U_EXPORT2 +uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, + UErrorCode *status); + + +#endif + +#endif /* USPOOFIM_H */ + diff --git a/thirdparty/icu4c/icudt70l.dat b/thirdparty/icu4c/icudt71l.dat Binary files differindex 7932bc2ccd..3fa3af9c23 100644 --- a/thirdparty/icu4c/icudt70l.dat +++ b/thirdparty/icu4c/icudt71l.dat |