diff options
Diffstat (limited to 'thirdparty/icu4c/common')
36 files changed, 556 insertions, 289 deletions
diff --git a/thirdparty/icu4c/common/brkeng.cpp b/thirdparty/icu4c/common/brkeng.cpp index 52e9c53621..dc9fb99bf1 100644 --- a/thirdparty/icu4c/common/brkeng.cpp +++ b/thirdparty/icu4c/common/brkeng.cpp @@ -79,6 +79,7 @@ UnhandledEngine::findBreaks( UText *text, int32_t /* startPos */, int32_t endPos, UVector32 &/*foundBreaks*/, + UBool /* isPhraseBreaking */, UErrorCode &status) const { if (U_FAILURE(status)) return 0; UChar32 c = utext_current32(text); diff --git a/thirdparty/icu4c/common/brkeng.h b/thirdparty/icu4c/common/brkeng.h index 6843f1cc95..127ba59e18 100644 --- a/thirdparty/icu4c/common/brkeng.h +++ b/thirdparty/icu4c/common/brkeng.h @@ -75,6 +75,7 @@ class LanguageBreakEngine : public UMemory { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const = 0; }; @@ -194,6 +195,7 @@ class UnhandledEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const override; /** diff --git a/thirdparty/icu4c/common/brkiter.cpp b/thirdparty/icu4c/common/brkiter.cpp index 8b228acf2c..8a1915880e 100644 --- a/thirdparty/icu4c/common/brkiter.cpp +++ b/thirdparty/icu4c/common/brkiter.cpp @@ -30,6 +30,7 @@ #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/filteredbrk.h" +#include "bytesinkutil.h" #include "ucln_cmn.h" #include "cstring.h" #include "umutex.h" @@ -115,7 +116,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st } // Create a RuleBasedBreakIterator - result = new RuleBasedBreakIterator(file, status); + result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status); // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != NULL) { @@ -408,7 +409,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) if (U_FAILURE(status)) { return NULL; } - char lbType[kKeyValueLenMax]; BreakIterator *result = NULL; switch (kind) { @@ -428,18 +428,29 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) break; case UBRK_LINE: { + char lb_lw[kKeyValueLenMax]; UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE); - uprv_strcpy(lbType, "line"); - char lbKeyValue[kKeyValueLenMax] = {0}; + uprv_strcpy(lb_lw, "line"); UErrorCode kvStatus = U_ZERO_ERROR; - int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); - if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { - uprv_strcat(lbType, "_"); - uprv_strcat(lbType, lbKeyValue); + CharString value; + CharStringByteSink valueSink(&value); + loc.getKeywordValue("lb", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); } - result = BreakIterator::buildInstance(loc, lbType, status); + // lw=phrase is only supported in Japanese. + if (uprv_strcmp(loc.getLanguage(), "ja") == 0) { + value.clear(); + loc.getKeywordValue("lw", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && value == "phrase") { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); + } + } + result = BreakIterator::buildInstance(loc, lb_lw, status); - UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue); + UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); UTRACE_EXIT_STATUS(status); } break; diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp index 4d158e3226..4fdbdf2760 100644 --- a/thirdparty/icu4c/common/dictbe.cpp +++ b/thirdparty/icu4c/common/dictbe.cpp @@ -17,7 +17,10 @@ #include "dictbe.h" #include "unicode/uniset.h" #include "unicode/chariter.h" +#include "unicode/resbund.h" #include "unicode/ubrk.h" +#include "unicode/usetiter.h" +#include "ubrkimpl.h" #include "utracimp.h" #include "uvectr32.h" #include "uvector.h" @@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; (void)startPos; // TODO: remove this param? @@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text, } rangeStart = start; rangeEnd = current; - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status); + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); utext_setNativeIndex(text, current); return result; @@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); - fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); + UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fThaiWordSet; + fEndWordSet = thaiWordSet; fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK @@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; utext_setNativeIndex(text, rangeStart); @@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); - fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); + UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fLaoWordSet; + fEndWordSet = laoWordSet; fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) @@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { @@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); - fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); + fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.add(0x0020); if (U_SUCCESS(status)) { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); - fEndWordSet = fBurmeseWordSet; - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching. fMarkSet.compact(); @@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { @@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); + UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fKhmerWordSet; + fEndWordSet = khmerWordSet; fBeginWordSet.add(0x1780, 0x17B3); //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { @@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType : DictionaryBreakEngine(), fDictionary(adoptDictionary) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); - // Korean dictionary only includes Hangul syllables - fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); - fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); - fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); - fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); nfkcNorm2 = Normalizer2::getNFKCInstance(status); - - if (U_SUCCESS(status)) { - // handle Korean and Japanese/Chinese using different dictionaries - if (type == kKorean) { + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); + fHangulWordSet.compact(); + // Digits, open puncutation and Alphabetic characters. + fDigitOrOpenPunctuationOrAlphabetSet.applyPattern( + UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status); + fDigitOrOpenPunctuationOrAlphabetSet.compact(); + fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status); + fClosePunctuationSet.compact(); + + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); - } else { //Chinese and Japanese - UnicodeSet cjSet; - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + } + } else { //Chinese and Japanese + UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + if (U_SUCCESS(status)) { setCharacters(cjSet); + initJapanesePhraseParameter(status); } } UTRACE_EXIT_STATUS(status); @@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) { (value >= 0xFF66 && value <= 0xFF9f); } - // Function for accessing internal utext flags. // Replicates an internal UText function. static inline int32_t utext_i32_flag(int32_t bitIndex) { return (int32_t)1 << bitIndex; } - /* * @param text A UText representing the text @@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if (rangeStart >= rangeEnd) { @@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { t_boundary.addElement(numCodePts, status); numBreaks++; + } else if (isPhraseBreaking) { + t_boundary.addElement(numCodePts, status); + if(U_SUCCESS(status)) { + numBreaks++; + int32_t prevIdx = numCodePts; + + int32_t codeUnitIdx = -1; + int32_t prevCodeUnitIdx = -1; + int32_t length = -1; + for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) { + codeUnitIdx = inString.moveIndex32(0, i); + prevCodeUnitIdx = inString.moveIndex32(0, prevIdx); + // Calculate the length by using the code unit. + length = prevCodeUnitIdx - codeUnitIdx; + prevIdx = i; + // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana + // characters don't occur. + if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length)) + && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1))) + || !isKatakana(inString.char32At(codeUnitIdx)))) { + t_boundary.addElement(i, status); + numBreaks++; + } + } + } } else { for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { t_boundary.addElement(i, status); @@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. int32_t prevCPPos = -1; int32_t prevUTextPos = -1; - for (int32_t i = numBreaks-1; i >= 0; i--) { + int32_t correctedNumBreaks = 0; + for (int32_t i = numBreaks - 1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart; @@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + if (utextPos != rangeStart + || (isPhraseBreaking && utextPos > 0 + && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + UChar32 nextChar = utext_char32At(inText, rangeEnd); + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 + // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U + if (isPhraseBreaking) { + if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) { + foundBreaks.popi(); + correctedNumBreaks--; + } + } else { + foundBreaks.popi(); + correctedNumBreaks--; + } + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; +} + +void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) { + loadJapaneseExtensions(error); + loadHiragana(error); +} + +void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) { + const char* tag = "extensions"; + ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error); + if (U_SUCCESS(error)) { + ResourceBundle bundle = ja.get(tag, error); + while (U_SUCCESS(error) && bundle.hasNext()) { + fSkipSet.puti(bundle.getNextString(error), 1, error); + } + } +} + +void CjkBreakEngine::loadHiragana(UErrorCode& error) { + UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error); + hiraganaWordSet.compact(); + UnicodeSetIterator iterator(hiraganaWordSet); + while (iterator.next()) { + fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error); + } } #endif diff --git a/thirdparty/icu4c/common/dictbe.h b/thirdparty/icu4c/common/dictbe.h index 4e70ed3817..ca1a3c28b7 100644 --- a/thirdparty/icu4c/common/dictbe.h +++ b/thirdparty/icu4c/common/dictbe.h @@ -15,6 +15,7 @@ #include "unicode/utext.h" #include "brkeng.h" +#include "hash.h" #include "uvectr32.h" U_NAMESPACE_BEGIN @@ -80,6 +81,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status ) const override; protected: @@ -105,6 +107,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const = 0; }; @@ -127,7 +130,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fThaiWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fSuffixSet; @@ -164,6 +166,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -186,7 +189,6 @@ class LaoBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fLaoWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -222,6 +224,7 @@ class LaoBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -244,7 +247,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fBurmeseWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -280,6 +282,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -302,7 +305,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fKhmerWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -338,6 +340,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -366,13 +369,22 @@ class CjkBreakEngine : public DictionaryBreakEngine { * @internal */ UnicodeSet fHangulWordSet; - UnicodeSet fHanWordSet; - UnicodeSet fKatakanaWordSet; - UnicodeSet fHiraganaWordSet; + UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; + UnicodeSet fClosePunctuationSet; DictionaryMatcher *fDictionary; const Normalizer2 *nfkcNorm2; + private: + // Load Japanese extensions. + void loadJapaneseExtensions(UErrorCode& error); + // Load Japanese Hiragana. + void loadHiragana(UErrorCode& error); + // Initialize fSkipSet by loading Japanese Hiragana and extensions. + void initJapanesePhraseParameter(UErrorCode& error); + + Hashtable fSkipSet; + public: /** @@ -404,6 +416,7 @@ class CjkBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp index 3d178dfbaf..2cad708d99 100644 --- a/thirdparty/icu4c/common/localematcher.cpp +++ b/thirdparty/icu4c/common/localematcher.cpp @@ -168,12 +168,9 @@ void LocaleMatcher::Builder::clearSupportedLocales() { bool LocaleMatcher::Builder::ensureSupportedLocaleVector() { if (U_FAILURE(errorCode_)) { return false; } if (supportedLocales_ != nullptr) { return true; } - supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); + LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_); if (U_FAILURE(errorCode_)) { return false; } - if (supportedLocales_ == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return false; - } + supportedLocales_ = lpSupportedLocales.orphan(); return true; } @@ -187,9 +184,8 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin for (int32_t i = 0; i < length; ++i) { Locale *locale = list.orphanLocaleAt(i); if (locale == nullptr) { continue; } - supportedLocales_->addElementX(locale, errorCode_); + supportedLocales_->adoptElement(locale, errorCode_); if (U_FAILURE(errorCode_)) { - delete locale; break; } } @@ -197,35 +193,21 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin } LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { - if (U_FAILURE(errorCode_)) { return *this; } - clearSupportedLocales(); - if (!ensureSupportedLocaleVector()) { return *this; } - while (locales.hasNext()) { - const Locale &locale = locales.next(); - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - break; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; - break; + if (ensureSupportedLocaleVector()) { + clearSupportedLocales(); + while (locales.hasNext() && U_SUCCESS(errorCode_)) { + const Locale &locale = locales.next(); + LocalPointer<Locale> clone (locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } } return *this; } LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { - if (!ensureSupportedLocaleVector()) { return *this; } - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return *this; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; + if (ensureSupportedLocaleVector()) { + LocalPointer<Locale> clone(locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } return *this; } diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp index e8859c7048..73bb8d8aec 100644 --- a/thirdparty/icu4c/common/locid.cpp +++ b/thirdparty/icu4c/common/locid.cpp @@ -1204,14 +1204,11 @@ AliasReplacer::parseLanguageReplacement( // We have multiple field so we have to allocate and parse CharString* str = new CharString( replacement, (int32_t)uprv_strlen(replacement), status); + LocalPointer<CharString> lpStr(str, status); + toBeFreed.adoptElement(lpStr.orphan(), status); if (U_FAILURE(status)) { return; } - if (str == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - toBeFreed.addElementX(str, status); char* data = str->data(); replacedLanguage = (const char*) data; char* endOfField = uprv_strchr(data, '_'); @@ -1420,12 +1417,9 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status) (int32_t)(firstSpace - replacement), status), status); } if (U_FAILURE(status)) { return false; } - if (item.isNull()) { - status = U_MEMORY_ALLOCATION_ERROR; - return false; - } replacedRegion = item->data(); - toBeFreed.addElementX(item.orphan(), status); + toBeFreed.adoptElement(item.orphan(), status); + if (U_FAILURE(status)) { return false; } } U_ASSERT(!same(region, replacedRegion)); region = replacedRegion; @@ -1659,10 +1653,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr && U_SUCCESS(status)) { *end = NULL_CHAR; // null terminate inside variantsBuff - variants.addElementX(start, status); + variants.addElement(start, status); start = end + 1; } - variants.addElementX(start, status); + variants.addElement(start, status); } if (U_FAILURE(status)) { return false; } diff --git a/thirdparty/icu4c/common/lstmbe.cpp b/thirdparty/icu4c/common/lstmbe.cpp index 3793abceb3..f6114cdfe2 100644 --- a/thirdparty/icu4c/common/lstmbe.cpp +++ b/thirdparty/icu4c/common/lstmbe.cpp @@ -1,8 +1,8 @@ // © 2021 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html +#include <complex> #include <utility> -#include <ctgmath> #include "unicode/utypes.h" @@ -639,6 +639,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; int32_t beginFoundBreakSize = foundBreaks.size(); diff --git a/thirdparty/icu4c/common/lstmbe.h b/thirdparty/icu4c/common/lstmbe.h index c3f7ecf815..ffdf805eca 100644 --- a/thirdparty/icu4c/common/lstmbe.h +++ b/thirdparty/icu4c/common/lstmbe.h @@ -62,6 +62,7 @@ protected: int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; private: const LSTMData* fData; diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp index 5bfd49e8cb..e6bd75e717 100644 --- a/thirdparty/icu4c/common/normalizer2impl.cpp +++ b/thirdparty/icu4c/common/normalizer2impl.cpp @@ -2496,15 +2496,18 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode // origin is not the first character, or it is U+0000. UnicodeSet *set; if((canonValue&CANON_HAS_SET)==0) { - set=new UnicodeSet; - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; + LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode); + set=lpSet.getAlias(); + if(U_FAILURE(errorCode)) { return; } UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); - canonStartSets.addElementX(set, errorCode); + canonStartSets.adoptElement(lpSet.orphan(), errorCode); + if (U_FAILURE(errorCode)) { + return; + } if(firstOrigin!=0) { set->add(firstOrigin); } diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp index f65177f232..cae8d154b3 100644 --- a/thirdparty/icu4c/common/rbbi.cpp +++ b/thirdparty/icu4c/common/rbbi.cpp @@ -82,6 +82,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +//------------------------------------------------------------------------------- +// +// Constructor from a UDataMemory handle to precompiled break rules +// stored in an ICU data file. This construcotr is private API, +// only for internal use. +// +//------------------------------------------------------------------------------- +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + UErrorCode &status) : RuleBasedBreakIterator(udm, status) +{ + fIsPhraseBreaking = isPhraseBreaking; +} + // // Construct from precompiled binary rules (tables). This constructor is public API, // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). @@ -322,6 +335,7 @@ void RuleBasedBreakIterator::init(UErrorCode &status) { fBreakCache = nullptr; fDictionaryCache = nullptr; fLookAheadMatches = nullptr; + fIsPhraseBreaking = false; // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. // fText = UTEXT_INITIALIZER; diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp index 6bfe3feca4..26d82df781 100644 --- a/thirdparty/icu4c/common/rbbi_cache.cpp +++ b/thirdparty/icu4c/common/rbbi_cache.cpp @@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status); + foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); } // Reload the loop variables for the next go-round diff --git a/thirdparty/icu4c/common/serv.cpp b/thirdparty/icu4c/common/serv.cpp index 0c54a4dce9..c26dbca1a9 100644 --- a/thirdparty/icu4c/common/serv.cpp +++ b/thirdparty/icu4c/common/serv.cpp @@ -625,10 +625,7 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC } } - LocalPointer<UnicodeString> idClone(new UnicodeString(*id), status); - if (U_SUCCESS(status) && idClone->isBogus()) { - status = U_MEMORY_ALLOCATION_ERROR; - } + LocalPointer<UnicodeString> idClone(id->clone(), status); result.adoptElement(idClone.orphan(), status); } delete fallbackKey; diff --git a/thirdparty/icu4c/common/servls.cpp b/thirdparty/icu4c/common/servls.cpp index 7108afd4a5..98f0a8a12b 100644 --- a/thirdparty/icu4c/common/servls.cpp +++ b/thirdparty/icu4c/common/servls.cpp @@ -179,7 +179,8 @@ private: length = other._ids.size(); for(i = 0; i < length; ++i) { - _ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + LocalPointer<UnicodeString> clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + _ids.adoptElement(clonedId.orphan(), status); } if(U_SUCCESS(status)) { diff --git a/thirdparty/icu4c/common/servnotf.cpp b/thirdparty/icu4c/common/servnotf.cpp index 342e0d9f24..d9fb388752 100644 --- a/thirdparty/icu4c/common/servnotf.cpp +++ b/thirdparty/icu4c/common/servnotf.cpp @@ -49,7 +49,11 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) if (acceptsListener(*l)) { Mutex lmx(¬ifyLock); if (listeners == NULL) { - listeners = new UVector(5, status); + LocalPointer<UVector> lpListeners(new UVector(5, status), status); + if (U_FAILURE(status)) { + return; + } + listeners = lpListeners.orphan(); } else { for (int i = 0, e = listeners->size(); i < e; ++i) { const EventListener* el = (const EventListener*)(listeners->elementAt(i)); @@ -59,7 +63,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) } } - listeners->addElementX((void*)l, status); // cast away const + listeners->addElement((void*)l, status); // cast away const } #ifdef NOTIFIER_DEBUG else { @@ -102,13 +106,11 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status) void ICUNotifier::notifyChanged(void) { + Mutex lmx(¬ifyLock); if (listeners != NULL) { - Mutex lmx(¬ifyLock); - if (listeners != NULL) { - for (int i = 0, e = listeners->size(); i < e; ++i) { - EventListener* el = (EventListener*)listeners->elementAt(i); - notifyListener(*el); - } + for (int i = 0, e = listeners->size(); i < e; ++i) { + EventListener* el = (EventListener*)listeners->elementAt(i); + notifyListener(*el); } } } diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp index bb5bdd1b50..f4e064961f 100644 --- a/thirdparty/icu4c/common/ubrk.cpp +++ b/thirdparty/icu4c/common/ubrk.cpp @@ -168,7 +168,7 @@ ubrk_safeClone( BreakIterator *newBI = ((BreakIterator *)bi)->clone(); if (newBI == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; - } else { + } else if (pBufferSize != NULL) { *status = U_SAFECLONE_ALLOCATED_WARNING; } return (UBreakIterator *)newBI; @@ -176,15 +176,7 @@ ubrk_safeClone( U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status) { - if (U_FAILURE(*status)) { - return nullptr; - } - BreakIterator *newBI = ((BreakIterator *)bi)->clone(); - if (newBI == nullptr) { - *status = U_MEMORY_ALLOCATION_ERROR; - return nullptr; - } - return (UBreakIterator *)newBI; + return ubrk_safeClone(bi, nullptr, nullptr, status); } diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp index 4aa856507a..388c86b1bb 100644 --- a/thirdparty/icu4c/common/ucase.cpp +++ b/thirdparty/icu4c/common/ucase.cpp @@ -22,27 +22,14 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/uset.h" -#include "unicode/udata.h" /* UDataInfo */ #include "unicode/utf16.h" -#include "ucmndata.h" /* DataHeader */ -#include "udatamem.h" -#include "umutex.h" -#include "uassert.h" #include "cmemory.h" -#include "utrie2.h" +#include "uassert.h" #include "ucase.h" +#include "umutex.h" +#include "utrie2.h" -struct UCaseProps { - UDataMemory *mem; - const int32_t *indexes; - const uint16_t *exceptions; - const uint16_t *unfold; - - UTrie2 trie; - uint8_t formatVersion[4]; -}; - -/* ucase_props_data.h is machine-generated by gencase --csource */ +/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */ #define INCLUDED_FROM_UCASE_CPP #include "ucase_props_data.h" @@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* data access primitives --------------------------------------------------- */ +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) { + *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions); + *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold); + return &ucase_props_singleton; +} + U_CFUNC const UTrie2 * U_EXPORT2 ucase_getTrie() { return &ucase_props_singleton.trie; @@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) { * - The general category of C is * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or * Letter Modifier (Lm), or Symbol Modifier (Sk) - * - C is one of the following characters + * - C is one of the following characters * U+0027 APOSTROPHE * U+00AD SOFT HYPHEN (SHY) * U+2019 RIGHT SINGLE QUOTATION MARK @@ -1064,6 +1058,8 @@ ucase_toFullLower(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1148,7 +1144,6 @@ ucase_toFullLower(UChar32 c, 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) { /* @@ -1215,6 +1210,8 @@ toUpperOrTitle(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { @@ -1252,7 +1249,6 @@ toUpperOrTitle(UChar32 c, 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(c==0x0587) { // See ICU-13416: @@ -1449,6 +1445,8 @@ ucase_toFullFolding(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1542,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2 u_tolower(UChar32 c) { return ucase_tolower(c); } - + /* Transforms the Unicode character to its upper case equivalent.*/ U_CAPI UChar32 U_EXPORT2 u_toupper(UChar32 c) { diff --git a/thirdparty/icu4c/common/ucase.h b/thirdparty/icu4c/common/ucase.h index a018f82b81..7bf57fd370 100644 --- a/thirdparty/icu4c/common/ucase.h +++ b/thirdparty/icu4c/common/ucase.h @@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c, U_CDECL_END +/* for icuexportdata -------------------------------------------------------- */ + +struct UCaseProps { + void *mem; // TODO: was unused, and type UDataMemory -- remove + const int32_t *indexes; + const uint16_t *exceptions; + const uint16_t *unfold; + + UTrie2 trie; + uint8_t formatVersion[4]; +}; + +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength); + /* file definitions --------------------------------------------------------- */ #define UCASE_DATA_NAME "ucase" diff --git a/thirdparty/icu4c/common/ucasemap.cpp b/thirdparty/icu4c/common/ucasemap.cpp index ed72bda828..95b55d56a0 100644 --- a/thirdparty/icu4c/common/ucasemap.cpp +++ b/thirdparty/icu4c/common/ucasemap.cpp @@ -112,8 +112,7 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { if(length==sizeof(csm->locale)) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } - if(U_SUCCESS(*pErrorCode)) { - csm->caseLocale=UCASE_LOC_UNKNOWN; + if(U_SUCCESS(*pErrorCode)) { csm->caseLocale = ucase_getCaseLocale(csm->locale); } else { csm->locale[0]=0; @@ -420,6 +419,97 @@ void toUpper(int32_t caseLocale, uint32_t options, #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0]; + +constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1]; + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, + ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar32 c2; + c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { + withAcute = true; + unchanged1 = 2; // ACUTE is 2 code units in UTF-8 + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { + return start; + } + if (doTitleJ) { + unchanged2 = 2; // ACUTE is 2 code units in UTF-8 + } else { + unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U8_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); + start += unchanged1; + if (doTitleJ) { + ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); + ++start; + } + ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, @@ -504,19 +594,14 @@ ucasemap_internalUTF8ToTitle( } /* Special case Dutch IJ titlecasing */ - if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits); - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1, - sink, options, edits, errorCode)) { - return; - } - titleLimit++; + if (titleLimit < index && + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); } } diff --git a/thirdparty/icu4c/common/ucnv.cpp b/thirdparty/icu4c/common/ucnv.cpp index 5dcf35e043..019bcb6a79 100644 --- a/thirdparty/icu4c/common/ucnv.cpp +++ b/thirdparty/icu4c/common/ucnv.cpp @@ -252,7 +252,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U UTRACE_EXIT_STATUS(*status); return NULL; } - *status = U_SAFECLONE_ALLOCATED_WARNING; + // If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function. + if (pBufferSize != &stackBufferSize) { + *status = U_SAFECLONE_ALLOCATED_WARNING; + } /* record the fact that memory was allocated */ *pBufferSize = bufferSizeNeeded; @@ -317,7 +320,11 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U return localConverter; } - +U_CAPI UConverter* U_EXPORT2 +ucnv_clone(const UConverter* cnv, UErrorCode *status) +{ + return ucnv_safeClone(cnv, nullptr, nullptr, status); +} /*Decreases the reference counter in the shared immutable section of the object *and frees the mutable part*/ diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp index 67aab4e8ff..6e489e0563 100644 --- a/thirdparty/icu4c/common/ucurr.cpp +++ b/thirdparty/icu4c/common/ucurr.cpp @@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void) } /** - * Deleter for OlsonToMetaMappingEntry + * Deleter for IsoCodeEntry */ static void U_CALLCONV deleteIsoCodeEntry(void *obj) { diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp index c8a3f1ff73..99c6a0af39 100644 --- a/thirdparty/icu4c/common/uloc.cpp +++ b/thirdparty/icu4c/common/uloc.cpp @@ -186,10 +186,10 @@ NULL }; static const char* const DEPRECATED_LANGUAGES[]={ - "in", "iw", "ji", "jw", NULL, NULL + "in", "iw", "ji", "jw", "mo", NULL, NULL }; static const char* const REPLACEMENT_LANGUAGES[]={ - "id", "he", "yi", "jv", NULL, NULL + "id", "he", "yi", "jv", "ro", NULL, NULL }; /** @@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = { /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */ "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF", /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */ - "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", + "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", NULL, /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */ "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h index 252bb7fdc2..0f7e04a3af 100644 --- a/thirdparty/icu4c/common/unicode/localematcher.h +++ b/thirdparty/icu4c/common/unicode/localematcher.h @@ -461,13 +461,13 @@ public: * Option for whether to include or ignore one-way (fallback) match data. * By default, they are included. * - * @param direction the match direction to set. + * @param matchDirection the match direction to set. * @return this Builder object * @stable ICU 67 */ - Builder &setDirection(ULocMatchDirection direction) { + Builder &setDirection(ULocMatchDirection matchDirection) { if (U_SUCCESS(errorCode_)) { - direction_ = direction; + direction_ = matchDirection; } return *this; } diff --git a/thirdparty/icu4c/common/unicode/rbbi.h b/thirdparty/icu4c/common/unicode/rbbi.h index 0ce93819f5..0bad0d3897 100644 --- a/thirdparty/icu4c/common/unicode/rbbi.h +++ b/thirdparty/icu4c/common/unicode/rbbi.h @@ -147,6 +147,11 @@ private: */ int32_t *fLookAheadMatches; + /** + * A flag to indicate if phrase based breaking is enabled. + */ + UBool fIsPhraseBreaking; + //======================================================================= // constructors //======================================================================= @@ -163,6 +168,21 @@ private: */ RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); + /** + * This constructor uses the udata interface to create a BreakIterator + * whose internal tables live in a memory-mapped file. "image" is an + * ICU UDataMemory handle for the pre-compiled break iterator tables. + * @param image handle to the memory image for the break iterator data. + * Ownership of the UDataMemory handle passes to the Break Iterator, + * which will be responsible for closing it when it is no longer needed. + * @param status Information on any errors encountered. + * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. + * @see udata_open + * @see #getBinaryRules + * @internal (private) + */ + RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); + /** @internal */ friend class RBBIRuleBuilder; /** @internal */ diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h index c603f7c13f..2b3dc7aa57 100644 --- a/thirdparty/icu4c/common/unicode/ubrk.h +++ b/thirdparty/icu4c/common/unicode/ubrk.h @@ -312,11 +312,12 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors - * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. + * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used + * if pBufferSize != NULL and any allocations were necessary * @return pointer to the new clone * @deprecated ICU 69 Use ubrk_clone() instead. */ -U_CAPI UBreakIterator * U_EXPORT2 +U_DEPRECATED UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, @@ -325,21 +326,17 @@ ubrk_safeClone( #endif /* U_HIDE_DEPRECATED_API */ -#ifndef U_HIDE_DRAFT_API - /** * Thread safe cloning operation. * @param bi iterator to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status); -#endif // U_HIDE_DRAFT_API - #ifndef U_HIDE_DEPRECATED_API /** diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h index 2687c984d4..20c173b662 100644 --- a/thirdparty/icu4c/common/unicode/ucnv.h +++ b/thirdparty/icu4c/common/unicode/ucnv.h @@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage, * * <p>The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters - * is call this function multiple times, or use the ucnv_safeClone() function to clone a + * is call this function multiple times, or use the ucnv_clone() function to clone a * 'primary' converter.</p> * * <p>A future version of ICU may add alias table lookups and/or caching @@ -493,7 +493,7 @@ ucnv_openCCSID(int32_t codepage, * @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred * @see udata_open * @see ucnv_open - * @see ucnv_safeClone + * @see ucnv_clone * @see ucnv_close * @stable ICU 2.2 */ @@ -502,6 +502,20 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode /** * Thread safe converter cloning operation. + * + * You must ucnv_close() the clone. + * + * @param cnv converter to be cloned + * @param status to indicate whether the operation went on smoothly or there were errors + * @return pointer to the new clone + * @stable ICU 71 + */ +U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status); + +#ifndef U_HIDE_DEPRECATED_API + +/** + * Thread safe converter cloning operation. * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space. * If the buffer size is sufficient, then the clone will use the stack buffer; @@ -532,21 +546,19 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode * pointer to size of allocated space. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, - * is used if any allocations were necessary. + * is used if pBufferSize != NULL and any allocations were necessary * However, it is better to check if *pBufferSize grew for checking for * allocations because warning codes can be overridden by subsequent * function calls. * @return pointer to the new clone - * @stable ICU 2.0 + * @deprecated ICU 71 Use ucnv_clone() instead. */ -U_CAPI UConverter * U_EXPORT2 +U_DEPRECATED UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); -#ifndef U_HIDE_DEPRECATED_API - /** * \def U_CNV_SAFECLONE_BUFFERSIZE * Definition of a buffer size that is designed to be large enough for diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h index 730337a353..310c7c8d20 100644 --- a/thirdparty/icu4c/common/unicode/uniset.h +++ b/thirdparty/icu4c/common/unicode/uniset.h @@ -1229,7 +1229,6 @@ public: */ UnicodeSet& retain(UChar32 c); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -1238,10 +1237,9 @@ public: * * @param s the source string * @return this object, for chaining - * @draft ICU 69 + * @stable ICU 69 */ UnicodeSet& retain(const UnicodeString &s); -#endif // U_HIDE_DRAFT_API /** * Removes the specified range from this set if it is present. diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h index 4605f632ea..d9f9b8f336 100644 --- a/thirdparty/icu4c/common/unicode/urename.h +++ b/thirdparty/icu4c/common/unicode/urename.h @@ -567,6 +567,7 @@ #define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure) #define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold) #define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale) +#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton) #define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie) #define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType) #define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable) @@ -630,6 +631,7 @@ #define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars) #define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub) #define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars) +#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone) #define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close) #define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames) #define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert) @@ -725,6 +727,7 @@ #define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString) #define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8) #define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize) +#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone) #define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary) #define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close) #define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements) @@ -904,6 +907,7 @@ #define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern) #define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions) #define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat) +#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle) #define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal) #define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle) #define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName) @@ -918,6 +922,7 @@ #define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat) #define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName) #define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat) +#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle) #define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal) #define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap) #define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close) diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h index 2ef352ef56..33332f2d36 100644 --- a/thirdparty/icu4c/common/unicode/uset.h +++ b/thirdparty/icu4c/common/unicode/uset.h @@ -628,7 +628,6 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end); U_CAPI void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); -#ifndef U_HIDE_DRAFT_API /** * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. @@ -636,11 +635,10 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Removes from this set all of its elements that are contained in the @@ -671,7 +669,6 @@ uset_removeAll(USet* set, const USet* removeSet); U_CAPI void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -681,7 +678,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainString(USet *set, const UChar *str, int32_t length); @@ -693,11 +690,10 @@ uset_retainString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Retains only the elements in this set that are contained in the @@ -741,7 +737,6 @@ uset_compact(USet* set); U_CAPI void U_EXPORT2 uset_complement(USet* set); -#ifndef U_HIDE_DRAFT_API /** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be @@ -753,7 +748,7 @@ uset_complement(USet* set); * @param set the object to be modified * @param start first character, inclusive, of range * @param end last character, inclusive, of range - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementRange(USet *set, UChar32 start, UChar32 end); @@ -766,7 +761,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementString(USet *set, const UChar *str, int32_t length); @@ -778,11 +773,10 @@ uset_complementString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Complements in this set all elements contained in the specified diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h index 42e8865d7e..2706e0b060 100644 --- a/thirdparty/icu4c/common/unicode/uvernum.h +++ b/thirdparty/icu4c/common/unicode/uvernum.h @@ -60,7 +60,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION_MAJOR_NUM 70 +#define U_ICU_VERSION_MAJOR_NUM 71 /** The current ICU minor version as an integer. * This value will change in the subsequent releases of ICU @@ -86,7 +86,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.6 */ -#define U_ICU_VERSION_SUFFIX _70 +#define U_ICU_VERSION_SUFFIX _71 /** * \def U_DEF2_ICU_ENTRY_POINT_RENAME @@ -139,7 +139,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION "70.1" +#define U_ICU_VERSION "71.1" /** * The current ICU library major version number as a string, for library name suffixes. @@ -152,13 +152,13 @@ * * @stable ICU 2.6 */ -#define U_ICU_VERSION_SHORT "70" +#define U_ICU_VERSION_SHORT "71" #ifndef U_HIDE_INTERNAL_API /** Data version in ICU4C. * @internal ICU 4.4 Internal Use Only **/ -#define U_ICU_DATA_VERSION "70.1" +#define U_ICU_DATA_VERSION "71.1" #endif /* U_HIDE_INTERNAL_API */ /*=========================================================================== diff --git a/thirdparty/icu4c/common/unistr.cpp b/thirdparty/icu4c/common/unistr.cpp index 077b4d6ef2..c18665928d 100644 --- a/thirdparty/icu4c/common/unistr.cpp +++ b/thirdparty/icu4c/common/unistr.cpp @@ -334,7 +334,8 @@ Replaceable::clone() const { // UnicodeString overrides clone() with a real implementation UnicodeString * UnicodeString::clone() const { - return new UnicodeString(*this); + LocalPointer<UnicodeString> clonedString(new UnicodeString(*this)); + return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr; } //======================================== @@ -1976,7 +1977,12 @@ The vector deleting destructor is already a part of UObject, but defining it here makes sure that it is included with this object file. This makes sure that static library dependencies are kept to a minimum. */ +#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" static void uprv_UnicodeStringDummy(void) { delete [] (new UnicodeString[2]); } +#pragma GCC diagnostic pop +#endif #endif diff --git a/thirdparty/icu4c/common/ustrcase.cpp b/thirdparty/icu4c/common/ustrcase.cpp index 36b19e75f2..43910ea520 100644 --- a/thirdparty/icu4c/common/ustrcase.cpp +++ b/thirdparty/icu4c/common/ustrcase.cpp @@ -36,6 +36,12 @@ #include "ustr_imp.h" #include "uassert.h" +/** + * Code point for COMBINING ACUTE ACCENT + * @internal + */ +#define ACUTE u'\u0301' + U_NAMESPACE_BEGIN namespace { @@ -396,6 +402,94 @@ U_NAMESPACE_USE #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit, + UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options, + icu::Edits *edits) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE) { + withAcute = true; + unchanged1 = 1; + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if (index == segmentLimit || src[index++] != ACUTE) { return start; } + if (doTitleJ) { + unchanged2 = 1; + } else { + ++unchanged1; + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U16_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits); + start += unchanged1; + if (doTitleJ) { + destIndex = appendUChar(dest, destIndex, destCapacity, u'J'); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + ++start; + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC int32_t U_CALLCONV ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, UChar *dest, int32_t destCapacity, @@ -412,14 +506,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it csc.limit=srcLength; int32_t destIndex=0; int32_t prev=0; - UBool isFirstIndex=TRUE; + bool isFirstIndex=true; /* titlecasing loop */ while(prev<srcLength) { /* find next index where to titlecase */ int32_t index; if(isFirstIndex) { - isFirstIndex=FALSE; + isFirstIndex=false; index=iter->first(); } else { index=iter->next(); @@ -446,7 +540,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it // Stop with titleStart<titleLimit<=index // if there is a character to be titlecased, // or else stop with titleStart==titleLimit==index. - UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; + bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { titleStart=titleLimit; if(titleLimit==index) { @@ -479,27 +573,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it /* Special case Dutch IJ titlecasing */ if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - if(edits!=NULL) { - edits->addReplace(1, 1); - } - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart+1, 1, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - titleLimit++; + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, + dest, destIndex, destCapacity, options, + edits); } } diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp index 4da8b864e1..844463921e 100644 --- a/thirdparty/icu4c/common/uvector.cpp +++ b/thirdparty/icu4c/common/uvector.cpp @@ -99,14 +99,6 @@ bool UVector::operator==(const UVector& other) const { return true; } -// TODO: delete this function once all call sites have been migrated to the -// new addElement(). -void UVector::addElementX(void* obj, UErrorCode &status) { - if (ensureCapacityX(count + 1, status)) { - elements[count++].pointer = obj; - } -} - void UVector::addElement(void* obj, UErrorCode &status) { U_ASSERT(deleter == nullptr); if (ensureCapacity(count + 1, status)) { @@ -331,38 +323,6 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const { return -1; } -UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) { - if (minimumCapacity < 0) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - if (capacity < minimumCapacity) { - if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - int32_t newCap = capacity * 2; - if (newCap < minimumCapacity) { - newCap = minimumCapacity; - } - if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) { // integer overflow check - // We keep the original memory contents on bad minimumCapacity. - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap); - if (newElems == nullptr) { - // We keep the original contents on the memory failure on realloc or bad minimumCapacity. - status = U_MEMORY_ALLOCATION_ERROR; - return FALSE; - } - elements = newElems; - capacity = newCap; - } - return TRUE; -} - - UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (U_FAILURE(status)) { return false; @@ -370,7 +330,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (minimumCapacity < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return false; - } + } if (capacity < minimumCapacity) { if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check status = U_ILLEGAL_ARGUMENT_ERROR; @@ -396,6 +356,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { } return true; } + /** * Change the size of this vector as follows: If newSize is smaller, * then truncate the array, possibly deleting held elements for i >= diff --git a/thirdparty/icu4c/common/uvector.h b/thirdparty/icu4c/common/uvector.h index f61fcc2be6..1eb7d136e7 100644 --- a/thirdparty/icu4c/common/uvector.h +++ b/thirdparty/icu4c/common/uvector.h @@ -123,12 +123,6 @@ public: // java.util.Vector API //------------------------------------------------------------ - /* - * Old version of addElement, with non-standard error handling. - * Will be removed once all uses have been switched to the new addElement(). - */ - void addElementX(void* obj, UErrorCode &status); - /** * Add an element at the end of the vector. * For use only with vectors that do not adopt their elements, which is to say, @@ -197,12 +191,6 @@ public: inline UBool isEmpty(void) const {return count == 0;} - /* - * Old version of ensureCapacity, with non-standard error handling. - * Will be removed once all uses have been switched to the new ensureCapacity(). - */ - UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status); - UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status); /** diff --git a/thirdparty/icu4c/common/uvectr32.cpp b/thirdparty/icu4c/common/uvectr32.cpp index a77ecb689f..2b4d0b8a75 100644 --- a/thirdparty/icu4c/common/uvectr32.cpp +++ b/thirdparty/icu4c/common/uvectr32.cpp @@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) { } -bool UVector32::operator==(const UVector32& other) { +bool UVector32::operator==(const UVector32& other) const { int32_t i; if (count != other.count) return false; for (i=0; i<count; ++i) { diff --git a/thirdparty/icu4c/common/uvectr32.h b/thirdparty/icu4c/common/uvectr32.h index f08c2ade76..ecefb7af3e 100644 --- a/thirdparty/icu4c/common/uvectr32.h +++ b/thirdparty/icu4c/common/uvectr32.h @@ -86,12 +86,12 @@ public: * equal if they are of the same size and all elements are equal, * as compared using this object's comparer. */ - bool operator==(const UVector32& other); + bool operator==(const UVector32& other) const; /** * Equivalent to !operator==() */ - inline bool operator!=(const UVector32& other); + inline bool operator!=(const UVector32& other) const; //------------------------------------------------------------ // java.util.Vector API @@ -268,7 +268,7 @@ inline int32_t UVector32::lastElementi(void) const { return elementAti(count-1); } -inline bool UVector32::operator!=(const UVector32& other) { +inline bool UVector32::operator!=(const UVector32& other) const { return !operator==(other); } |