From 93fba7ead33b45a6f9904ab6a69ada72e8564230 Mon Sep 17 00:00:00 2001 From: bruvzg <7645683+bruvzg@users.noreply.github.com> Date: Tue, 17 May 2022 18:14:19 +0300 Subject: Update HarfBuzz, ICU and FreeType. HarfBuzz: Update to version 4.2.1 FreeType: Update to version 2.12.1 ICU: Update to version 71.1 --- thirdparty/icu4c/LICENSE | 125 ++++++++++++++++-- thirdparty/icu4c/common/brkeng.cpp | 1 + thirdparty/icu4c/common/brkeng.h | 2 + thirdparty/icu4c/common/brkiter.cpp | 31 +++-- thirdparty/icu4c/common/dictbe.cpp | 165 ++++++++++++++++++------ thirdparty/icu4c/common/dictbe.h | 27 +++- thirdparty/icu4c/common/localematcher.cpp | 42 ++---- thirdparty/icu4c/common/locid.cpp | 18 +-- thirdparty/icu4c/common/lstmbe.cpp | 3 +- thirdparty/icu4c/common/lstmbe.h | 1 + thirdparty/icu4c/common/normalizer2impl.cpp | 11 +- thirdparty/icu4c/common/rbbi.cpp | 14 ++ thirdparty/icu4c/common/rbbi_cache.cpp | 2 +- thirdparty/icu4c/common/serv.cpp | 5 +- thirdparty/icu4c/common/servls.cpp | 3 +- thirdparty/icu4c/common/servnotf.cpp | 18 +-- thirdparty/icu4c/common/ubrk.cpp | 12 +- thirdparty/icu4c/common/ucase.cpp | 40 +++--- thirdparty/icu4c/common/ucase.h | 15 +++ thirdparty/icu4c/common/ucasemap.cpp | 115 ++++++++++++++--- thirdparty/icu4c/common/ucnv.cpp | 11 +- thirdparty/icu4c/common/ucurr.cpp | 2 +- thirdparty/icu4c/common/uloc.cpp | 6 +- thirdparty/icu4c/common/unicode/localematcher.h | 6 +- thirdparty/icu4c/common/unicode/rbbi.h | 20 +++ thirdparty/icu4c/common/unicode/ubrk.h | 11 +- thirdparty/icu4c/common/unicode/ucnv.h | 26 +++- thirdparty/icu4c/common/unicode/uniset.h | 4 +- thirdparty/icu4c/common/unicode/urename.h | 5 + thirdparty/icu4c/common/unicode/uset.h | 18 +-- thirdparty/icu4c/common/unicode/uvernum.h | 10 +- thirdparty/icu4c/common/unistr.cpp | 8 +- thirdparty/icu4c/common/ustrcase.cpp | 130 +++++++++++++++---- thirdparty/icu4c/common/uvector.cpp | 43 +----- thirdparty/icu4c/common/uvector.h | 12 -- thirdparty/icu4c/common/uvectr32.cpp | 2 +- thirdparty/icu4c/common/uvectr32.h | 6 +- thirdparty/icu4c/icudt70l.dat | Bin 4141152 -> 0 bytes thirdparty/icu4c/icudt71l.dat | Bin 0 -> 4226000 bytes 39 files changed, 672 insertions(+), 298 deletions(-) delete mode 100644 thirdparty/icu4c/icudt70l.dat create mode 100644 thirdparty/icu4c/icudt71l.dat (limited to 'thirdparty/icu4c') diff --git a/thirdparty/icu4c/LICENSE b/thirdparty/icu4c/LICENSE index 970ae074cb..80b587723a 100644 --- a/thirdparty/icu4c/LICENSE +++ b/thirdparty/icu4c/LICENSE @@ -1,6 +1,19 @@ -COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE -Copyright © 1991-2020 Unicode, Inc. All rights reserved. +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. +BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S +DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), +YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. +IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE +THE DATA FILES OR SOFTWARE. + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2022 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in https://www.unicode.org/copyright.html. Permission is hereby granted, free of charge, to any person obtaining @@ -32,7 +45,7 @@ shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder. ---------------------- +---------------------------------------------------------------------- Third-Party Software Licenses @@ -40,7 +53,9 @@ This section contains third-party software notices and/or additional terms for licensed third-party software components included within ICU libraries. -1. ICU License - ICU 1.8.1 to ICU 57.1 +---------------------------------------------------------------------- + +ICU License - ICU 1.8.1 to ICU 57.1 COPYRIGHT AND PERMISSION NOTICE @@ -75,7 +90,9 @@ of the copyright holder. All trademarks and registered trademarks mentioned herein are the property of their respective owners. -2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt) +---------------------------------------------------------------------- + +Chinese/Japanese Word Break Dictionary Data (cjdict.txt) # The Google Chrome software developed by Google is licensed under # the BSD license. Other software included in this distribution is @@ -279,7 +296,9 @@ property of their respective owners. # # ---------------COPYING.ipadic-----END---------------------------------- -3. Lao Word Break Dictionary Data (laodict.txt) +---------------------------------------------------------------------- + +Lao Word Break Dictionary Data (laodict.txt) # Copyright (C) 2016 and later: Unicode, Inc. and others. # License & terms of use: http://www.unicode.org/copyright.html @@ -319,7 +338,9 @@ property of their respective owners. # OF THE POSSIBILITY OF SUCH DAMAGE. # -------------------------------------------------------------------------- -4. Burmese Word Break Dictionary Data (burmesedict.txt) +---------------------------------------------------------------------- + +Burmese Word Break Dictionary Data (burmesedict.txt) # Copyright (c) 2014 International Business Machines Corporation # and others. All Rights Reserved. @@ -359,7 +380,9 @@ property of their respective owners. # SUCH DAMAGE. # -------------------------------------------------------------------------- -5. Time Zone Database +---------------------------------------------------------------------- + +Time Zone Database ICU uses the public domain data and code derived from Time Zone Database for its time zone support. The ownership of the TZ database @@ -382,7 +405,9 @@ Database section 7. # making a contribution to the database or code waives all rights to # future claims in that contribution or in the TZ Database. -6. Google double-conversion +---------------------------------------------------------------------- + +Google double-conversion Copyright 2006-2011, the V8 project authors. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -410,3 +435,85 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +File: aclocal.m4 (only for ICU4C) +Section: pkg.m4 - Macros to locate and utilise pkg-config. + + +Copyright © 2004 Scott James Remnant . +Copyright © 2012-2015 Dan Nicholson + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) + +---------------------------------------------------------------------- + +File: config.guess (only for ICU4C) + + +This file is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, see . + +As a special exception to the GNU General Public License, if you +distribute this file as part of a program that contains a +configuration script generated by Autoconf, you may include it under +the same distribution terms that you use for the rest of that +program. This Exception is an additional permission under section 7 +of the GNU General Public License, version 3 ("GPLv3"). + + +(The condition for the exception is fulfilled because +ICU4C includes a configuration script generated by Autoconf, +namely the `configure` script.) + +---------------------------------------------------------------------- + +File: install-sh (only for ICU4C) + + +Copyright 1991 by the Massachusetts Institute of Technology + +Permission to use, copy, modify, distribute, and sell this software and its +documentation for any purpose is hereby granted without fee, provided that +the above copyright notice appear in all copies and that both that +copyright notice and this permission notice appear in supporting +documentation, and that the name of M.I.T. not be used in advertising or +publicity pertaining to distribution of the software without specific, +written prior permission. M.I.T. makes no representations about the +suitability of this software for any purpose. It is provided "as is" +without express or implied warranty. diff --git a/thirdparty/icu4c/common/brkeng.cpp b/thirdparty/icu4c/common/brkeng.cpp index 52e9c53621..dc9fb99bf1 100644 --- a/thirdparty/icu4c/common/brkeng.cpp +++ b/thirdparty/icu4c/common/brkeng.cpp @@ -79,6 +79,7 @@ UnhandledEngine::findBreaks( UText *text, int32_t /* startPos */, int32_t endPos, UVector32 &/*foundBreaks*/, + UBool /* isPhraseBreaking */, UErrorCode &status) const { if (U_FAILURE(status)) return 0; UChar32 c = utext_current32(text); diff --git a/thirdparty/icu4c/common/brkeng.h b/thirdparty/icu4c/common/brkeng.h index 6843f1cc95..127ba59e18 100644 --- a/thirdparty/icu4c/common/brkeng.h +++ b/thirdparty/icu4c/common/brkeng.h @@ -75,6 +75,7 @@ class LanguageBreakEngine : public UMemory { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const = 0; }; @@ -194,6 +195,7 @@ class UnhandledEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode &status) const override; /** diff --git a/thirdparty/icu4c/common/brkiter.cpp b/thirdparty/icu4c/common/brkiter.cpp index 8b228acf2c..8a1915880e 100644 --- a/thirdparty/icu4c/common/brkiter.cpp +++ b/thirdparty/icu4c/common/brkiter.cpp @@ -30,6 +30,7 @@ #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/filteredbrk.h" +#include "bytesinkutil.h" #include "ucln_cmn.h" #include "cstring.h" #include "umutex.h" @@ -115,7 +116,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st } // Create a RuleBasedBreakIterator - result = new RuleBasedBreakIterator(file, status); + result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status); // If there is a result, set the valid locale and actual locale, and the kind if (U_SUCCESS(status) && result != NULL) { @@ -408,7 +409,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) if (U_FAILURE(status)) { return NULL; } - char lbType[kKeyValueLenMax]; BreakIterator *result = NULL; switch (kind) { @@ -428,18 +428,29 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status) break; case UBRK_LINE: { + char lb_lw[kKeyValueLenMax]; UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE); - uprv_strcpy(lbType, "line"); - char lbKeyValue[kKeyValueLenMax] = {0}; + uprv_strcpy(lb_lw, "line"); UErrorCode kvStatus = U_ZERO_ERROR; - int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus); - if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) { - uprv_strcat(lbType, "_"); - uprv_strcat(lbType, lbKeyValue); + CharString value; + CharStringByteSink valueSink(&value); + loc.getKeywordValue("lb", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); } - result = BreakIterator::buildInstance(loc, lbType, status); + // lw=phrase is only supported in Japanese. + if (uprv_strcmp(loc.getLanguage(), "ja") == 0) { + value.clear(); + loc.getKeywordValue("lw", valueSink, kvStatus); + if (U_SUCCESS(kvStatus) && value == "phrase") { + uprv_strcat(lb_lw, "_"); + uprv_strcat(lb_lw, value.data()); + } + } + result = BreakIterator::buildInstance(loc, lb_lw, status); - UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue); + UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw); UTRACE_EXIT_STATUS(status); } break; diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp index 4d158e3226..4fdbdf2760 100644 --- a/thirdparty/icu4c/common/dictbe.cpp +++ b/thirdparty/icu4c/common/dictbe.cpp @@ -17,7 +17,10 @@ #include "dictbe.h" #include "unicode/uniset.h" #include "unicode/chariter.h" +#include "unicode/resbund.h" #include "unicode/ubrk.h" +#include "unicode/usetiter.h" +#include "ubrkimpl.h" #include "utracimp.h" #include "uvectr32.h" #include "uvector.h" @@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; (void)startPos; // TODO: remove this param? @@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text, } rangeStart = start; rangeEnd = current; - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status); + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); utext_setNativeIndex(text, current); return result; @@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); - fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); + UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fThaiWordSet; + fEndWordSet = thaiWordSet; fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK @@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; utext_setNativeIndex(text, rangeStart); @@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); - fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); + UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fLaoWordSet; + fEndWordSet = laoWordSet; fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) @@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { @@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); - fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); + fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.add(0x0020); if (U_SUCCESS(status)) { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); - fEndWordSet = fBurmeseWordSet; - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching. fMarkSet.compact(); @@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { @@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); + UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fKhmerWordSet; + fEndWordSet = khmerWordSet; fBeginWordSet.add(0x1780, 0x17B3); //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { @@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType : DictionaryBreakEngine(), fDictionary(adoptDictionary) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); - // Korean dictionary only includes Hangul syllables - fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); - fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); - fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); - fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); nfkcNorm2 = Normalizer2::getNFKCInstance(status); - - if (U_SUCCESS(status)) { - // handle Korean and Japanese/Chinese using different dictionaries - if (type == kKorean) { + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); + fHangulWordSet.compact(); + // Digits, open puncutation and Alphabetic characters. + fDigitOrOpenPunctuationOrAlphabetSet.applyPattern( + UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status); + fDigitOrOpenPunctuationOrAlphabetSet.compact(); + fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status); + fClosePunctuationSet.compact(); + + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); - } else { //Chinese and Japanese - UnicodeSet cjSet; - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + } + } else { //Chinese and Japanese + UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + if (U_SUCCESS(status)) { setCharacters(cjSet); + initJapanesePhraseParameter(status); } } UTRACE_EXIT_STATUS(status); @@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) { (value >= 0xFF66 && value <= 0xFF9f); } - // Function for accessing internal utext flags. // Replicates an internal UText function. static inline int32_t utext_i32_flag(int32_t bitIndex) { return (int32_t)1 << bitIndex; } - /* * @param text A UText representing the text @@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if (rangeStart >= rangeEnd) { @@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { t_boundary.addElement(numCodePts, status); numBreaks++; + } else if (isPhraseBreaking) { + t_boundary.addElement(numCodePts, status); + if(U_SUCCESS(status)) { + numBreaks++; + int32_t prevIdx = numCodePts; + + int32_t codeUnitIdx = -1; + int32_t prevCodeUnitIdx = -1; + int32_t length = -1; + for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) { + codeUnitIdx = inString.moveIndex32(0, i); + prevCodeUnitIdx = inString.moveIndex32(0, prevIdx); + // Calculate the length by using the code unit. + length = prevCodeUnitIdx - codeUnitIdx; + prevIdx = i; + // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana + // characters don't occur. + if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length)) + && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1))) + || !isKatakana(inString.char32At(codeUnitIdx)))) { + t_boundary.addElement(i, status); + numBreaks++; + } + } + } } else { for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { t_boundary.addElement(i, status); @@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. int32_t prevCPPos = -1; int32_t prevUTextPos = -1; - for (int32_t i = numBreaks-1; i >= 0; i--) { + int32_t correctedNumBreaks = 0; + for (int32_t i = numBreaks - 1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart; @@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + if (utextPos != rangeStart + || (isPhraseBreaking && utextPos > 0 + && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + UChar32 nextChar = utext_char32At(inText, rangeEnd); + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 + // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U + if (isPhraseBreaking) { + if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) { + foundBreaks.popi(); + correctedNumBreaks--; + } + } else { + foundBreaks.popi(); + correctedNumBreaks--; + } + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; +} + +void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) { + loadJapaneseExtensions(error); + loadHiragana(error); +} + +void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) { + const char* tag = "extensions"; + ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error); + if (U_SUCCESS(error)) { + ResourceBundle bundle = ja.get(tag, error); + while (U_SUCCESS(error) && bundle.hasNext()) { + fSkipSet.puti(bundle.getNextString(error), 1, error); + } + } +} + +void CjkBreakEngine::loadHiragana(UErrorCode& error) { + UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error); + hiraganaWordSet.compact(); + UnicodeSetIterator iterator(hiraganaWordSet); + while (iterator.next()) { + fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error); + } } #endif diff --git a/thirdparty/icu4c/common/dictbe.h b/thirdparty/icu4c/common/dictbe.h index 4e70ed3817..ca1a3c28b7 100644 --- a/thirdparty/icu4c/common/dictbe.h +++ b/thirdparty/icu4c/common/dictbe.h @@ -15,6 +15,7 @@ #include "unicode/utext.h" #include "brkeng.h" +#include "hash.h" #include "uvectr32.h" U_NAMESPACE_BEGIN @@ -80,6 +81,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status ) const override; protected: @@ -105,6 +107,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const = 0; }; @@ -127,7 +130,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fThaiWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fSuffixSet; @@ -164,6 +166,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -186,7 +189,6 @@ class LaoBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fLaoWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -222,6 +224,7 @@ class LaoBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -244,7 +247,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fBurmeseWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -280,6 +282,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -302,7 +305,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine { * @internal */ - UnicodeSet fKhmerWordSet; UnicodeSet fEndWordSet; UnicodeSet fBeginWordSet; UnicodeSet fMarkSet; @@ -338,6 +340,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; @@ -366,13 +369,22 @@ class CjkBreakEngine : public DictionaryBreakEngine { * @internal */ UnicodeSet fHangulWordSet; - UnicodeSet fHanWordSet; - UnicodeSet fKatakanaWordSet; - UnicodeSet fHiraganaWordSet; + UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; + UnicodeSet fClosePunctuationSet; DictionaryMatcher *fDictionary; const Normalizer2 *nfkcNorm2; + private: + // Load Japanese extensions. + void loadJapaneseExtensions(UErrorCode& error); + // Load Japanese Hiragana. + void loadHiragana(UErrorCode& error); + // Initialize fSkipSet by loading Japanese Hiragana and extensions. + void initJapanesePhraseParameter(UErrorCode& error); + + Hashtable fSkipSet; + public: /** @@ -404,6 +416,7 @@ class CjkBreakEngine : public DictionaryBreakEngine { int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; }; diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp index 3d178dfbaf..2cad708d99 100644 --- a/thirdparty/icu4c/common/localematcher.cpp +++ b/thirdparty/icu4c/common/localematcher.cpp @@ -168,12 +168,9 @@ void LocaleMatcher::Builder::clearSupportedLocales() { bool LocaleMatcher::Builder::ensureSupportedLocaleVector() { if (U_FAILURE(errorCode_)) { return false; } if (supportedLocales_ != nullptr) { return true; } - supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_); + LocalPointer lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_); if (U_FAILURE(errorCode_)) { return false; } - if (supportedLocales_ == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return false; - } + supportedLocales_ = lpSupportedLocales.orphan(); return true; } @@ -187,9 +184,8 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin for (int32_t i = 0; i < length; ++i) { Locale *locale = list.orphanLocaleAt(i); if (locale == nullptr) { continue; } - supportedLocales_->addElementX(locale, errorCode_); + supportedLocales_->adoptElement(locale, errorCode_); if (U_FAILURE(errorCode_)) { - delete locale; break; } } @@ -197,35 +193,21 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin } LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) { - if (U_FAILURE(errorCode_)) { return *this; } - clearSupportedLocales(); - if (!ensureSupportedLocaleVector()) { return *this; } - while (locales.hasNext()) { - const Locale &locale = locales.next(); - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - break; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; - break; + if (ensureSupportedLocaleVector()) { + clearSupportedLocales(); + while (locales.hasNext() && U_SUCCESS(errorCode_)) { + const Locale &locale = locales.next(); + LocalPointer clone (locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } } return *this; } LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) { - if (!ensureSupportedLocaleVector()) { return *this; } - Locale *clone = locale.clone(); - if (clone == nullptr) { - errorCode_ = U_MEMORY_ALLOCATION_ERROR; - return *this; - } - supportedLocales_->addElementX(clone, errorCode_); - if (U_FAILURE(errorCode_)) { - delete clone; + if (ensureSupportedLocaleVector()) { + LocalPointer clone(locale.clone(), errorCode_); + supportedLocales_->adoptElement(clone.orphan(), errorCode_); } return *this; } diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp index e8859c7048..73bb8d8aec 100644 --- a/thirdparty/icu4c/common/locid.cpp +++ b/thirdparty/icu4c/common/locid.cpp @@ -1204,14 +1204,11 @@ AliasReplacer::parseLanguageReplacement( // We have multiple field so we have to allocate and parse CharString* str = new CharString( replacement, (int32_t)uprv_strlen(replacement), status); + LocalPointer lpStr(str, status); + toBeFreed.adoptElement(lpStr.orphan(), status); if (U_FAILURE(status)) { return; } - if (str == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - toBeFreed.addElementX(str, status); char* data = str->data(); replacedLanguage = (const char*) data; char* endOfField = uprv_strchr(data, '_'); @@ -1420,12 +1417,9 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status) (int32_t)(firstSpace - replacement), status), status); } if (U_FAILURE(status)) { return false; } - if (item.isNull()) { - status = U_MEMORY_ALLOCATION_ERROR; - return false; - } replacedRegion = item->data(); - toBeFreed.addElementX(item.orphan(), status); + toBeFreed.adoptElement(item.orphan(), status); + if (U_FAILURE(status)) { return false; } } U_ASSERT(!same(region, replacedRegion)); region = replacedRegion; @@ -1659,10 +1653,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr && U_SUCCESS(status)) { *end = NULL_CHAR; // null terminate inside variantsBuff - variants.addElementX(start, status); + variants.addElement(start, status); start = end + 1; } - variants.addElementX(start, status); + variants.addElement(start, status); } if (U_FAILURE(status)) { return false; } diff --git a/thirdparty/icu4c/common/lstmbe.cpp b/thirdparty/icu4c/common/lstmbe.cpp index 3793abceb3..f6114cdfe2 100644 --- a/thirdparty/icu4c/common/lstmbe.cpp +++ b/thirdparty/icu4c/common/lstmbe.cpp @@ -1,8 +1,8 @@ // © 2021 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html +#include #include -#include #include "unicode/utypes.h" @@ -639,6 +639,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; int32_t beginFoundBreakSize = foundBreaks.size(); diff --git a/thirdparty/icu4c/common/lstmbe.h b/thirdparty/icu4c/common/lstmbe.h index c3f7ecf815..ffdf805eca 100644 --- a/thirdparty/icu4c/common/lstmbe.h +++ b/thirdparty/icu4c/common/lstmbe.h @@ -62,6 +62,7 @@ protected: int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const override; private: const LSTMData* fData; diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp index 5bfd49e8cb..e6bd75e717 100644 --- a/thirdparty/icu4c/common/normalizer2impl.cpp +++ b/thirdparty/icu4c/common/normalizer2impl.cpp @@ -2496,15 +2496,18 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode // origin is not the first character, or it is U+0000. UnicodeSet *set; if((canonValue&CANON_HAS_SET)==0) { - set=new UnicodeSet; - if(set==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; + LocalPointer lpSet(new UnicodeSet, errorCode); + set=lpSet.getAlias(); + if(U_FAILURE(errorCode)) { return; } UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); - canonStartSets.addElementX(set, errorCode); + canonStartSets.adoptElement(lpSet.orphan(), errorCode); + if (U_FAILURE(errorCode)) { + return; + } if(firstOrigin!=0) { set->add(firstOrigin); } diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp index f65177f232..cae8d154b3 100644 --- a/thirdparty/icu4c/common/rbbi.cpp +++ b/thirdparty/icu4c/common/rbbi.cpp @@ -82,6 +82,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode } } +//------------------------------------------------------------------------------- +// +// Constructor from a UDataMemory handle to precompiled break rules +// stored in an ICU data file. This construcotr is private API, +// only for internal use. +// +//------------------------------------------------------------------------------- +RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking, + UErrorCode &status) : RuleBasedBreakIterator(udm, status) +{ + fIsPhraseBreaking = isPhraseBreaking; +} + // // Construct from precompiled binary rules (tables). This constructor is public API, // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). @@ -322,6 +335,7 @@ void RuleBasedBreakIterator::init(UErrorCode &status) { fBreakCache = nullptr; fDictionaryCache = nullptr; fLookAheadMatches = nullptr; + fIsPhraseBreaking = false; // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. // fText = UTEXT_INITIALIZER; diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp index 6bfe3feca4..26d82df781 100644 --- a/thirdparty/icu4c/common/rbbi_cache.cpp +++ b/thirdparty/icu4c/common/rbbi_cache.cpp @@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo // Ask the language object if there are any breaks. It will add them to the cache and // leave the text pointer on the other side of its range, ready to search for the next one. if (lbe != NULL) { - foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status); + foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status); } // Reload the loop variables for the next go-round diff --git a/thirdparty/icu4c/common/serv.cpp b/thirdparty/icu4c/common/serv.cpp index 0c54a4dce9..c26dbca1a9 100644 --- a/thirdparty/icu4c/common/serv.cpp +++ b/thirdparty/icu4c/common/serv.cpp @@ -625,10 +625,7 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC } } - LocalPointer idClone(new UnicodeString(*id), status); - if (U_SUCCESS(status) && idClone->isBogus()) { - status = U_MEMORY_ALLOCATION_ERROR; - } + LocalPointer idClone(id->clone(), status); result.adoptElement(idClone.orphan(), status); } delete fallbackKey; diff --git a/thirdparty/icu4c/common/servls.cpp b/thirdparty/icu4c/common/servls.cpp index 7108afd4a5..98f0a8a12b 100644 --- a/thirdparty/icu4c/common/servls.cpp +++ b/thirdparty/icu4c/common/servls.cpp @@ -179,7 +179,8 @@ private: length = other._ids.size(); for(i = 0; i < length; ++i) { - _ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + LocalPointer clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status); + _ids.adoptElement(clonedId.orphan(), status); } if(U_SUCCESS(status)) { diff --git a/thirdparty/icu4c/common/servnotf.cpp b/thirdparty/icu4c/common/servnotf.cpp index 342e0d9f24..d9fb388752 100644 --- a/thirdparty/icu4c/common/servnotf.cpp +++ b/thirdparty/icu4c/common/servnotf.cpp @@ -49,7 +49,11 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) if (acceptsListener(*l)) { Mutex lmx(¬ifyLock); if (listeners == NULL) { - listeners = new UVector(5, status); + LocalPointer lpListeners(new UVector(5, status), status); + if (U_FAILURE(status)) { + return; + } + listeners = lpListeners.orphan(); } else { for (int i = 0, e = listeners->size(); i < e; ++i) { const EventListener* el = (const EventListener*)(listeners->elementAt(i)); @@ -59,7 +63,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status) } } - listeners->addElementX((void*)l, status); // cast away const + listeners->addElement((void*)l, status); // cast away const } #ifdef NOTIFIER_DEBUG else { @@ -102,13 +106,11 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status) void ICUNotifier::notifyChanged(void) { + Mutex lmx(¬ifyLock); if (listeners != NULL) { - Mutex lmx(¬ifyLock); - if (listeners != NULL) { - for (int i = 0, e = listeners->size(); i < e; ++i) { - EventListener* el = (EventListener*)listeners->elementAt(i); - notifyListener(*el); - } + for (int i = 0, e = listeners->size(); i < e; ++i) { + EventListener* el = (EventListener*)listeners->elementAt(i); + notifyListener(*el); } } } diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp index bb5bdd1b50..f4e064961f 100644 --- a/thirdparty/icu4c/common/ubrk.cpp +++ b/thirdparty/icu4c/common/ubrk.cpp @@ -168,7 +168,7 @@ ubrk_safeClone( BreakIterator *newBI = ((BreakIterator *)bi)->clone(); if (newBI == NULL) { *status = U_MEMORY_ALLOCATION_ERROR; - } else { + } else if (pBufferSize != NULL) { *status = U_SAFECLONE_ALLOCATED_WARNING; } return (UBreakIterator *)newBI; @@ -176,15 +176,7 @@ ubrk_safeClone( U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status) { - if (U_FAILURE(*status)) { - return nullptr; - } - BreakIterator *newBI = ((BreakIterator *)bi)->clone(); - if (newBI == nullptr) { - *status = U_MEMORY_ALLOCATION_ERROR; - return nullptr; - } - return (UBreakIterator *)newBI; + return ubrk_safeClone(bi, nullptr, nullptr, status); } diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp index 4aa856507a..388c86b1bb 100644 --- a/thirdparty/icu4c/common/ucase.cpp +++ b/thirdparty/icu4c/common/ucase.cpp @@ -22,27 +22,14 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/uset.h" -#include "unicode/udata.h" /* UDataInfo */ #include "unicode/utf16.h" -#include "ucmndata.h" /* DataHeader */ -#include "udatamem.h" -#include "umutex.h" -#include "uassert.h" #include "cmemory.h" -#include "utrie2.h" +#include "uassert.h" #include "ucase.h" +#include "umutex.h" +#include "utrie2.h" -struct UCaseProps { - UDataMemory *mem; - const int32_t *indexes; - const uint16_t *exceptions; - const uint16_t *unfold; - - UTrie2 trie; - uint8_t formatVersion[4]; -}; - -/* ucase_props_data.h is machine-generated by gencase --csource */ +/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */ #define INCLUDED_FROM_UCASE_CPP #include "ucase_props_data.h" @@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* data access primitives --------------------------------------------------- */ +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) { + *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions); + *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold); + return &ucase_props_singleton; +} + U_CFUNC const UTrie2 * U_EXPORT2 ucase_getTrie() { return &ucase_props_singleton.trie; @@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) { * - The general category of C is * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or * Letter Modifier (Lm), or Symbol Modifier (Sk) - * - C is one of the following characters + * - C is one of the following characters * U+0027 APOSTROPHE * U+00AD SOFT HYPHEN (SHY) * U+2019 RIGHT SINGLE QUOTATION MARK @@ -1064,6 +1058,8 @@ ucase_toFullLower(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1148,7 +1144,6 @@ ucase_toFullLower(UChar32 c, 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) { /* @@ -1215,6 +1210,8 @@ toUpperOrTitle(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_GET_TYPE(props)==UCASE_LOWER) { @@ -1252,7 +1249,6 @@ toUpperOrTitle(UChar32 c, 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE */ - *pString=nullptr; return 0; /* remove the dot (continue without output) */ } else if(c==0x0587) { // See ICU-13416: @@ -1449,6 +1445,8 @@ ucase_toFullFolding(UChar32 c, // The sign of the result has meaning, input must be non-negative so that it can be returned as is. U_ASSERT(c >= 0); UChar32 result=c; + // Reset the output pointer in case it was uninitialized. + *pString=nullptr; uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); if(!UCASE_HAS_EXCEPTION(props)) { if(UCASE_IS_UPPER_OR_TITLE(props)) { @@ -1542,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2 u_tolower(UChar32 c) { return ucase_tolower(c); } - + /* Transforms the Unicode character to its upper case equivalent.*/ U_CAPI UChar32 U_EXPORT2 u_toupper(UChar32 c) { diff --git a/thirdparty/icu4c/common/ucase.h b/thirdparty/icu4c/common/ucase.h index a018f82b81..7bf57fd370 100644 --- a/thirdparty/icu4c/common/ucase.h +++ b/thirdparty/icu4c/common/ucase.h @@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c, U_CDECL_END +/* for icuexportdata -------------------------------------------------------- */ + +struct UCaseProps { + void *mem; // TODO: was unused, and type UDataMemory -- remove + const int32_t *indexes; + const uint16_t *exceptions; + const uint16_t *unfold; + + UTrie2 trie; + uint8_t formatVersion[4]; +}; + +U_CAPI const struct UCaseProps * U_EXPORT2 +ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength); + /* file definitions --------------------------------------------------------- */ #define UCASE_DATA_NAME "ucase" diff --git a/thirdparty/icu4c/common/ucasemap.cpp b/thirdparty/icu4c/common/ucasemap.cpp index ed72bda828..95b55d56a0 100644 --- a/thirdparty/icu4c/common/ucasemap.cpp +++ b/thirdparty/icu4c/common/ucasemap.cpp @@ -112,8 +112,7 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { if(length==sizeof(csm->locale)) { *pErrorCode=U_BUFFER_OVERFLOW_ERROR; } - if(U_SUCCESS(*pErrorCode)) { - csm->caseLocale=UCASE_LOC_UNKNOWN; + if(U_SUCCESS(*pErrorCode)) { csm->caseLocale = ucase_getCaseLocale(csm->locale); } else { csm->locale[0]=0; @@ -420,6 +419,97 @@ void toUpper(int32_t caseLocale, uint32_t options, #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0]; + +constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1]; + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit, + ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar32 c2; + c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) { + withAcute = true; + unchanged1 = 2; // ACUTE is 2 code units in UTF-8 + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) { + return start; + } + if (doTitleJ) { + unchanged2 = 2; // ACUTE is 2 code units in UTF-8 + } else { + unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8 + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U8_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode); + start += unchanged1; + if (doTitleJ) { + ByteSinkUtil::appendCodePoint(1, u'J', sink, edits); + ++start; + } + ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC void U_CALLCONV ucasemap_internalUTF8ToTitle( int32_t caseLocale, uint32_t options, BreakIterator *iter, @@ -504,19 +594,14 @@ ucasemap_internalUTF8ToTitle( } /* Special case Dutch IJ titlecasing */ - if (titleStart+1 < index && - caseLocale == UCASE_LOC_DUTCH && - (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { - if (src[titleStart+1] == 0x006A) { - ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits); - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1, - sink, options, edits, errorCode)) { - return; - } - titleLimit++; + if (titleLimit < index && + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode); } } diff --git a/thirdparty/icu4c/common/ucnv.cpp b/thirdparty/icu4c/common/ucnv.cpp index 5dcf35e043..019bcb6a79 100644 --- a/thirdparty/icu4c/common/ucnv.cpp +++ b/thirdparty/icu4c/common/ucnv.cpp @@ -252,7 +252,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U UTRACE_EXIT_STATUS(*status); return NULL; } - *status = U_SAFECLONE_ALLOCATED_WARNING; + // If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function. + if (pBufferSize != &stackBufferSize) { + *status = U_SAFECLONE_ALLOCATED_WARNING; + } /* record the fact that memory was allocated */ *pBufferSize = bufferSizeNeeded; @@ -317,7 +320,11 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U return localConverter; } - +U_CAPI UConverter* U_EXPORT2 +ucnv_clone(const UConverter* cnv, UErrorCode *status) +{ + return ucnv_safeClone(cnv, nullptr, nullptr, status); +} /*Decreases the reference counter in the shared immutable section of the object *and frees the mutable part*/ diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp index 67aab4e8ff..6e489e0563 100644 --- a/thirdparty/icu4c/common/ucurr.cpp +++ b/thirdparty/icu4c/common/ucurr.cpp @@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void) } /** - * Deleter for OlsonToMetaMappingEntry + * Deleter for IsoCodeEntry */ static void U_CALLCONV deleteIsoCodeEntry(void *obj) { diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp index c8a3f1ff73..99c6a0af39 100644 --- a/thirdparty/icu4c/common/uloc.cpp +++ b/thirdparty/icu4c/common/uloc.cpp @@ -186,10 +186,10 @@ NULL }; static const char* const DEPRECATED_LANGUAGES[]={ - "in", "iw", "ji", "jw", NULL, NULL + "in", "iw", "ji", "jw", "mo", NULL, NULL }; static const char* const REPLACEMENT_LANGUAGES[]={ - "id", "he", "yi", "jv", NULL, NULL + "id", "he", "yi", "jv", "ro", NULL, NULL }; /** @@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = { /* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */ "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF", /* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */ - "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", + "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE", NULL, /* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */ "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR", diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h index 252bb7fdc2..0f7e04a3af 100644 --- a/thirdparty/icu4c/common/unicode/localematcher.h +++ b/thirdparty/icu4c/common/unicode/localematcher.h @@ -461,13 +461,13 @@ public: * Option for whether to include or ignore one-way (fallback) match data. * By default, they are included. * - * @param direction the match direction to set. + * @param matchDirection the match direction to set. * @return this Builder object * @stable ICU 67 */ - Builder &setDirection(ULocMatchDirection direction) { + Builder &setDirection(ULocMatchDirection matchDirection) { if (U_SUCCESS(errorCode_)) { - direction_ = direction; + direction_ = matchDirection; } return *this; } diff --git a/thirdparty/icu4c/common/unicode/rbbi.h b/thirdparty/icu4c/common/unicode/rbbi.h index 0ce93819f5..0bad0d3897 100644 --- a/thirdparty/icu4c/common/unicode/rbbi.h +++ b/thirdparty/icu4c/common/unicode/rbbi.h @@ -147,6 +147,11 @@ private: */ int32_t *fLookAheadMatches; + /** + * A flag to indicate if phrase based breaking is enabled. + */ + UBool fIsPhraseBreaking; + //======================================================================= // constructors //======================================================================= @@ -163,6 +168,21 @@ private: */ RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); + /** + * This constructor uses the udata interface to create a BreakIterator + * whose internal tables live in a memory-mapped file. "image" is an + * ICU UDataMemory handle for the pre-compiled break iterator tables. + * @param image handle to the memory image for the break iterator data. + * Ownership of the UDataMemory handle passes to the Break Iterator, + * which will be responsible for closing it when it is no longer needed. + * @param status Information on any errors encountered. + * @param isPhraseBreaking true if phrase based breaking is required, otherwise false. + * @see udata_open + * @see #getBinaryRules + * @internal (private) + */ + RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status); + /** @internal */ friend class RBBIRuleBuilder; /** @internal */ diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h index c603f7c13f..2b3dc7aa57 100644 --- a/thirdparty/icu4c/common/unicode/ubrk.h +++ b/thirdparty/icu4c/common/unicode/ubrk.h @@ -312,11 +312,12 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength, * If *pBufferSize is not enough for a stack-based safe clone, * new memory will be allocated. * @param status to indicate whether the operation went on smoothly or there were errors - * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. + * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used + * if pBufferSize != NULL and any allocations were necessary * @return pointer to the new clone * @deprecated ICU 69 Use ubrk_clone() instead. */ -U_CAPI UBreakIterator * U_EXPORT2 +U_DEPRECATED UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, @@ -325,21 +326,17 @@ ubrk_safeClone( #endif /* U_HIDE_DEPRECATED_API */ -#ifndef U_HIDE_DRAFT_API - /** * Thread safe cloning operation. * @param bi iterator to be cloned * @param status to indicate whether the operation went on smoothly or there were errors * @return pointer to the new clone - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI UBreakIterator * U_EXPORT2 ubrk_clone(const UBreakIterator *bi, UErrorCode *status); -#endif // U_HIDE_DRAFT_API - #ifndef U_HIDE_DEPRECATED_API /** diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h index 2687c984d4..20c173b662 100644 --- a/thirdparty/icu4c/common/unicode/ucnv.h +++ b/thirdparty/icu4c/common/unicode/ucnv.h @@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage, * *

The name will NOT be looked up in the alias mechanism, nor will the converter be * stored in the converter cache or the alias table. The only way to open further converters - * is call this function multiple times, or use the ucnv_safeClone() function to clone a + * is call this function multiple times, or use the ucnv_clone() function to clone a * 'primary' converter.

* *

A future version of ICU may add alias table lookups and/or caching @@ -493,13 +493,27 @@ ucnv_openCCSID(int32_t codepage, * @return the created Unicode converter object, or NULL if an error occurred * @see udata_open * @see ucnv_open - * @see ucnv_safeClone + * @see ucnv_clone * @see ucnv_close * @stable ICU 2.2 */ U_CAPI UConverter* U_EXPORT2 ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err); +/** + * Thread safe converter cloning operation. + * + * You must ucnv_close() the clone. + * + * @param cnv converter to be cloned + * @param status to indicate whether the operation went on smoothly or there were errors + * @return pointer to the new clone + * @stable ICU 71 + */ +U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status); + +#ifndef U_HIDE_DEPRECATED_API + /** * Thread safe converter cloning operation. * For most efficient operation, pass in a stackBuffer (and a *pBufferSize) @@ -532,21 +546,19 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode * pointer to size of allocated space. * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_WARNING, - * is used if any allocations were necessary. + * is used if pBufferSize != NULL and any allocations were necessary * However, it is better to check if *pBufferSize grew for checking for * allocations because warning codes can be overridden by subsequent * function calls. * @return pointer to the new clone - * @stable ICU 2.0 + * @deprecated ICU 71 Use ucnv_clone() instead. */ -U_CAPI UConverter * U_EXPORT2 +U_DEPRECATED UConverter * U_EXPORT2 ucnv_safeClone(const UConverter *cnv, void *stackBuffer, int32_t *pBufferSize, UErrorCode *status); -#ifndef U_HIDE_DEPRECATED_API - /** * \def U_CNV_SAFECLONE_BUFFERSIZE * Definition of a buffer size that is designed to be large enough for diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h index 730337a353..310c7c8d20 100644 --- a/thirdparty/icu4c/common/unicode/uniset.h +++ b/thirdparty/icu4c/common/unicode/uniset.h @@ -1229,7 +1229,6 @@ public: */ UnicodeSet& retain(UChar32 c); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -1238,10 +1237,9 @@ public: * * @param s the source string * @return this object, for chaining - * @draft ICU 69 + * @stable ICU 69 */ UnicodeSet& retain(const UnicodeString &s); -#endif // U_HIDE_DRAFT_API /** * Removes the specified range from this set if it is present. diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h index 4605f632ea..d9f9b8f336 100644 --- a/thirdparty/icu4c/common/unicode/urename.h +++ b/thirdparty/icu4c/common/unicode/urename.h @@ -567,6 +567,7 @@ #define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure) #define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold) #define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale) +#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton) #define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie) #define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType) #define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable) @@ -630,6 +631,7 @@ #define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars) #define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub) #define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars) +#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone) #define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close) #define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames) #define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert) @@ -725,6 +727,7 @@ #define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString) #define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8) #define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize) +#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone) #define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary) #define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close) #define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements) @@ -904,6 +907,7 @@ #define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern) #define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions) #define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat) +#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle) #define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal) #define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle) #define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName) @@ -918,6 +922,7 @@ #define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat) #define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName) #define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat) +#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle) #define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal) #define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap) #define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close) diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h index 2ef352ef56..33332f2d36 100644 --- a/thirdparty/icu4c/common/unicode/uset.h +++ b/thirdparty/icu4c/common/unicode/uset.h @@ -628,7 +628,6 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end); U_CAPI void U_EXPORT2 uset_removeString(USet* set, const UChar* str, int32_t strLen); -#ifndef U_HIDE_DRAFT_API /** * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} * A frozen set will not be modified. @@ -636,11 +635,10 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Removes from this set all of its elements that are contained in the @@ -671,7 +669,6 @@ uset_removeAll(USet* set, const USet* removeSet); U_CAPI void U_EXPORT2 uset_retain(USet* set, UChar32 start, UChar32 end); -#ifndef U_HIDE_DRAFT_API /** * Retains only the specified string from this set if it is present. * Upon return this set will be empty if it did not contain s, or @@ -681,7 +678,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainString(USet *set, const UChar *str, int32_t length); @@ -693,11 +690,10 @@ uset_retainString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Retains only the elements in this set that are contained in the @@ -741,7 +737,6 @@ uset_compact(USet* set); U_CAPI void U_EXPORT2 uset_complement(USet* set); -#ifndef U_HIDE_DRAFT_API /** * Complements the specified range in this set. Any character in * the range will be removed if it is in this set, or will be @@ -753,7 +748,7 @@ uset_complement(USet* set); * @param set the object to be modified * @param start first character, inclusive, of range * @param end last character, inclusive, of range - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementRange(USet *set, UChar32 start, UChar32 end); @@ -766,7 +761,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementString(USet *set, const UChar *str, int32_t length); @@ -778,11 +773,10 @@ uset_complementString(USet *set, const UChar *str, int32_t length); * @param set the object to be modified * @param str the string * @param length the length of the string, or -1 if NUL-terminated - * @draft ICU 69 + * @stable ICU 69 */ U_CAPI void U_EXPORT2 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); -#endif // U_HIDE_DRAFT_API /** * Complements in this set all elements contained in the specified diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h index 42e8865d7e..2706e0b060 100644 --- a/thirdparty/icu4c/common/unicode/uvernum.h +++ b/thirdparty/icu4c/common/unicode/uvernum.h @@ -60,7 +60,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION_MAJOR_NUM 70 +#define U_ICU_VERSION_MAJOR_NUM 71 /** The current ICU minor version as an integer. * This value will change in the subsequent releases of ICU @@ -86,7 +86,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.6 */ -#define U_ICU_VERSION_SUFFIX _70 +#define U_ICU_VERSION_SUFFIX _71 /** * \def U_DEF2_ICU_ENTRY_POINT_RENAME @@ -139,7 +139,7 @@ * This value will change in the subsequent releases of ICU * @stable ICU 2.4 */ -#define U_ICU_VERSION "70.1" +#define U_ICU_VERSION "71.1" /** * The current ICU library major version number as a string, for library name suffixes. @@ -152,13 +152,13 @@ * * @stable ICU 2.6 */ -#define U_ICU_VERSION_SHORT "70" +#define U_ICU_VERSION_SHORT "71" #ifndef U_HIDE_INTERNAL_API /** Data version in ICU4C. * @internal ICU 4.4 Internal Use Only **/ -#define U_ICU_DATA_VERSION "70.1" +#define U_ICU_DATA_VERSION "71.1" #endif /* U_HIDE_INTERNAL_API */ /*=========================================================================== diff --git a/thirdparty/icu4c/common/unistr.cpp b/thirdparty/icu4c/common/unistr.cpp index 077b4d6ef2..c18665928d 100644 --- a/thirdparty/icu4c/common/unistr.cpp +++ b/thirdparty/icu4c/common/unistr.cpp @@ -334,7 +334,8 @@ Replaceable::clone() const { // UnicodeString overrides clone() with a real implementation UnicodeString * UnicodeString::clone() const { - return new UnicodeString(*this); + LocalPointer clonedString(new UnicodeString(*this)); + return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr; } //======================================== @@ -1976,7 +1977,12 @@ The vector deleting destructor is already a part of UObject, but defining it here makes sure that it is included with this object file. This makes sure that static library dependencies are kept to a minimum. */ +#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-function" static void uprv_UnicodeStringDummy(void) { delete [] (new UnicodeString[2]); } +#pragma GCC diagnostic pop +#endif #endif diff --git a/thirdparty/icu4c/common/ustrcase.cpp b/thirdparty/icu4c/common/ustrcase.cpp index 36b19e75f2..43910ea520 100644 --- a/thirdparty/icu4c/common/ustrcase.cpp +++ b/thirdparty/icu4c/common/ustrcase.cpp @@ -36,6 +36,12 @@ #include "ustr_imp.h" #include "uassert.h" +/** + * Code point for COMBINING ACUTE ACCENT + * @internal + */ +#define ACUTE u'\u0301' + U_NAMESPACE_BEGIN namespace { @@ -396,6 +402,94 @@ U_NAMESPACE_USE #if !UCONFIG_NO_BREAK_ITERATION +namespace { + +/** + * Input: c is a letter I with or without acute accent. + * start is the index in src after c, and is less than segmentLimit. + * If a plain i/I is followed by a plain j/J, + * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute, + * then we output accordingly. + * + * @return the src index after the titlecased sequence, or the start index if no Dutch IJ + */ +int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit, + UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options, + icu::Edits *edits) { + U_ASSERT(start < segmentLimit); + + int32_t index = start; + bool withAcute = false; + + // If the conditions are met, then the following variables tell us what to output. + int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3) + bool doTitleJ = false; // true if the j needs to be titlecased + int32_t unchanged2 = 0; // after the j (0 or 1) + + // next character after the first letter + UChar c2 = src[index++]; + + // Is the first letter an i/I with accent? + if (c == u'I') { + if (c2 == ACUTE) { + withAcute = true; + unchanged1 = 1; + if (index == segmentLimit) { return start; } + c2 = src[index++]; + } + } else { // Í + withAcute = true; + } + + // Is the next character a j/J? + if (c2 == u'j') { + doTitleJ = true; + } else if (c2 == u'J') { + ++unchanged1; + } else { + return start; + } + + // A plain i/I must be followed by a plain j/J. + // An i/I with acute must be followed by a j/J with acute. + if (withAcute) { + if (index == segmentLimit || src[index++] != ACUTE) { return start; } + if (doTitleJ) { + unchanged2 = 1; + } else { + ++unchanged1; + } + } + + // There must not be another combining mark. + if (index < segmentLimit) { + int32_t cp; + int32_t i = index; + U16_NEXT(src, i, segmentLimit, cp); + uint32_t typeMask = U_GET_GC_MASK(cp); + if ((typeMask & U_GC_M_MASK) != 0) { + return start; + } + } + + // Output the rest of the Dutch IJ. + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits); + start += unchanged1; + if (doTitleJ) { + destIndex = appendUChar(dest, destIndex, destCapacity, u'J'); + if (edits != nullptr) { + edits->addReplace(1, 1); + } + ++start; + } + destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits); + + U_ASSERT(start + unchanged2 == index); + return index; +} + +} // namespace + U_CFUNC int32_t U_CALLCONV ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, UChar *dest, int32_t destCapacity, @@ -412,14 +506,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it csc.limit=srcLength; int32_t destIndex=0; int32_t prev=0; - UBool isFirstIndex=TRUE; + bool isFirstIndex=true; /* titlecasing loop */ while(prevfirst(); } else { index=iter->next(); @@ -446,7 +540,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it // Stop with titleStartaddReplace(1, 1); - } - titleLimit++; - } else if (src[titleStart+1] == 0x004A) { - // Keep the capital J from getting lowercased. - destIndex=appendUnchanged(dest, destIndex, destCapacity, - src+titleStart+1, 1, options, edits); - if(destIndex<0) { - errorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - titleLimit++; + caseLocale == UCASE_LOC_DUTCH) { + if (c < 0) { + c = ~c; + } + + if (c == u'I' || c == u'Í') { + titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, + dest, destIndex, destCapacity, options, + edits); } } diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp index 4da8b864e1..844463921e 100644 --- a/thirdparty/icu4c/common/uvector.cpp +++ b/thirdparty/icu4c/common/uvector.cpp @@ -99,14 +99,6 @@ bool UVector::operator==(const UVector& other) const { return true; } -// TODO: delete this function once all call sites have been migrated to the -// new addElement(). -void UVector::addElementX(void* obj, UErrorCode &status) { - if (ensureCapacityX(count + 1, status)) { - elements[count++].pointer = obj; - } -} - void UVector::addElement(void* obj, UErrorCode &status) { U_ASSERT(deleter == nullptr); if (ensureCapacity(count + 1, status)) { @@ -331,38 +323,6 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const { return -1; } -UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) { - if (minimumCapacity < 0) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - if (capacity < minimumCapacity) { - if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - int32_t newCap = capacity * 2; - if (newCap < minimumCapacity) { - newCap = minimumCapacity; - } - if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) { // integer overflow check - // We keep the original memory contents on bad minimumCapacity. - status = U_ILLEGAL_ARGUMENT_ERROR; - return FALSE; - } - UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap); - if (newElems == nullptr) { - // We keep the original contents on the memory failure on realloc or bad minimumCapacity. - status = U_MEMORY_ALLOCATION_ERROR; - return FALSE; - } - elements = newElems; - capacity = newCap; - } - return TRUE; -} - - UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (U_FAILURE(status)) { return false; @@ -370,7 +330,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { if (minimumCapacity < 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return false; - } + } if (capacity < minimumCapacity) { if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check status = U_ILLEGAL_ARGUMENT_ERROR; @@ -396,6 +356,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) { } return true; } + /** * Change the size of this vector as follows: If newSize is smaller, * then truncate the array, possibly deleting held elements for i >= diff --git a/thirdparty/icu4c/common/uvector.h b/thirdparty/icu4c/common/uvector.h index f61fcc2be6..1eb7d136e7 100644 --- a/thirdparty/icu4c/common/uvector.h +++ b/thirdparty/icu4c/common/uvector.h @@ -123,12 +123,6 @@ public: // java.util.Vector API //------------------------------------------------------------ - /* - * Old version of addElement, with non-standard error handling. - * Will be removed once all uses have been switched to the new addElement(). - */ - void addElementX(void* obj, UErrorCode &status); - /** * Add an element at the end of the vector. * For use only with vectors that do not adopt their elements, which is to say, @@ -197,12 +191,6 @@ public: inline UBool isEmpty(void) const {return count == 0;} - /* - * Old version of ensureCapacity, with non-standard error handling. - * Will be removed once all uses have been switched to the new ensureCapacity(). - */ - UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status); - UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status); /** diff --git a/thirdparty/icu4c/common/uvectr32.cpp b/thirdparty/icu4c/common/uvectr32.cpp index a77ecb689f..2b4d0b8a75 100644 --- a/thirdparty/icu4c/common/uvectr32.cpp +++ b/thirdparty/icu4c/common/uvectr32.cpp @@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) { } -bool UVector32::operator==(const UVector32& other) { +bool UVector32::operator==(const UVector32& other) const { int32_t i; if (count != other.count) return false; for (i=0; i