Merge pull request #61124 from bruvzg/icu_hb_ft_update

Update HarfBuzz, ICU and FreeType.
author: Rémi Verschelde <remi@verschelde.fr> 2022-05-17 19:00:41 +0200
committer: GitHub <noreply@github.com> 2022-05-17 19:00:41 +0200
commit: abee814263c586d96b523b483bafb86d7a63aa58 (patch)
tree: 2d29c9b6ebaec415bdce2100e2319fcb651d8bbe /thirdparty/icu4c/common
parent: 7ea8cde9834b0fda4a928217b7880da3dd330214 (diff)
parent: 93fba7ead33b45a6f9904ab6a69ada72e8564230 (diff)
36 files changed, 556 insertions, 289 deletions
diff --git a/thirdparty/icu4c/common/brkeng.cpp b/thirdparty/icu4c/common/brkeng.cpp
index 52e9c53621..dc9fb99bf1 100644
--- a/thirdparty/icu4c/common/brkeng.cpp
+++ b/thirdparty/icu4c/common/brkeng.cpp
@@ -79,6 +79,7 @@ UnhandledEngine::findBreaks( UText *text,
                              int32_t /* startPos */,
                              int32_t endPos,
                              UVector32 &/*foundBreaks*/,
+                             UBool /* isPhraseBreaking */,
                              UErrorCode &status) const {
     if (U_FAILURE(status)) return 0;
     UChar32 c = utext_current32(text); 
diff --git a/thirdparty/icu4c/common/brkeng.h b/thirdparty/icu4c/common/brkeng.h
index 6843f1cc95..127ba59e18 100644
--- a/thirdparty/icu4c/common/brkeng.h
+++ b/thirdparty/icu4c/common/brkeng.h
@@ -75,6 +75,7 @@ class LanguageBreakEngine : public UMemory {
                               int32_t startPos,
                               int32_t endPos,
                               UVector32 &foundBreaks,
+                              UBool isPhraseBreaking,
                               UErrorCode &status) const = 0;
 
 };
@@ -194,6 +195,7 @@ class UnhandledEngine : public LanguageBreakEngine {
                               int32_t startPos,
                               int32_t endPos,
                               UVector32 &foundBreaks,
+                              UBool isPhraseBreaking,
                               UErrorCode &status) const override;
 
  /**
diff --git a/thirdparty/icu4c/common/brkiter.cpp b/thirdparty/icu4c/common/brkiter.cpp
index 8b228acf2c..8a1915880e 100644
--- a/thirdparty/icu4c/common/brkiter.cpp
+++ b/thirdparty/icu4c/common/brkiter.cpp
@@ -30,6 +30,7 @@
 #include "unicode/ures.h"
 #include "unicode/ustring.h"
 #include "unicode/filteredbrk.h"
+#include "bytesinkutil.h"
 #include "ucln_cmn.h"
 #include "cstring.h"
 #include "umutex.h"
@@ -115,7 +116,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
     }
 
     // Create a RuleBasedBreakIterator
-    result = new RuleBasedBreakIterator(file, status);
+    result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status);
 
     // If there is a result, set the valid locale and actual locale, and the kind
     if (U_SUCCESS(status) && result != NULL) {
@@ -408,7 +409,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
     if (U_FAILURE(status)) {
         return NULL;
     }
-    char lbType[kKeyValueLenMax];
 
     BreakIterator *result = NULL;
     switch (kind) {
@@ -428,18 +428,29 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
         break;
     case UBRK_LINE:
         {
+            char lb_lw[kKeyValueLenMax];
             UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
-            uprv_strcpy(lbType, "line");
-            char lbKeyValue[kKeyValueLenMax] = {0};
+            uprv_strcpy(lb_lw, "line");
             UErrorCode kvStatus = U_ZERO_ERROR;
-            int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
-            if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
-                uprv_strcat(lbType, "_");
-                uprv_strcat(lbType, lbKeyValue);
+            CharString value;
+            CharStringByteSink valueSink(&value);
+            loc.getKeywordValue("lb", valueSink, kvStatus);
+            if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
+                uprv_strcat(lb_lw, "_");
+                uprv_strcat(lb_lw, value.data());
             }
-            result = BreakIterator::buildInstance(loc, lbType, status);
+            // lw=phrase is only supported in Japanese.
+            if (uprv_strcmp(loc.getLanguage(), "ja") == 0) {
+                value.clear();
+                loc.getKeywordValue("lw", valueSink, kvStatus);
+                if (U_SUCCESS(kvStatus) && value == "phrase") {
+                    uprv_strcat(lb_lw, "_");
+                    uprv_strcat(lb_lw, value.data());
+                }
+            }
+            result = BreakIterator::buildInstance(loc, lb_lw, status);
 
-            UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue);
+            UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
             UTRACE_EXIT_STATUS(status);
         }
         break;
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp
index 4d158e3226..4fdbdf2760 100644
--- a/thirdparty/icu4c/common/dictbe.cpp
+++ b/thirdparty/icu4c/common/dictbe.cpp
@@ -17,7 +17,10 @@
 #include "dictbe.h"
 #include "unicode/uniset.h"
 #include "unicode/chariter.h"
+#include "unicode/resbund.h"
 #include "unicode/ubrk.h"
+#include "unicode/usetiter.h"
+#include "ubrkimpl.h"
 #include "utracimp.h"
 #include "uvectr32.h"
 #include "uvector.h"
@@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
                                  int32_t startPos,
                                  int32_t endPos,
                                  UVector32 &foundBreaks,
+                                 UBool isPhraseBreaking,
                                  UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     (void)startPos;            // TODO: remove this param?
@@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
     }
     rangeStart = start;
     rangeEnd = current;
-    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
+    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
     utext_setNativeIndex(text, current);
     
     return result;
@@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
-    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fThaiWordSet);
+        setCharacters(thaiWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fThaiWordSet;
+    fEndWordSet = thaiWordSet;
     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
@@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     utext_setNativeIndex(text, rangeStart);
@@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
-    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fLaoWordSet);
+        setCharacters(laoWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fLaoWordSet;
+    fEndWordSet = laoWordSet;
     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
@@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
@@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
-    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
+    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
+    fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.add(0x0020);
     if (U_SUCCESS(status)) {
-        setCharacters(fBurmeseWordSet);
+        setCharacters(fEndWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
-    fMarkSet.add(0x0020);
-    fEndWordSet = fBurmeseWordSet;
-    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 
     // Compact for caching.
     fMarkSet.compact();
@@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status ) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
@@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
-    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fKhmerWordSet);
+        setCharacters(khmerWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fKhmerWordSet;
+    fEndWordSet = khmerWordSet;
     fBeginWordSet.add(0x1780, 0x17B3);
     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
@@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status ) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
@@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
-    // Korean dictionary only includes Hangul syllables
-    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
-    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
-    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
-    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
-
-    if (U_SUCCESS(status)) {
-        // handle Korean and Japanese/Chinese using different dictionaries
-        if (type == kKorean) {
+    // Korean dictionary only includes Hangul syllables
+    fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
+    fHangulWordSet.compact();
+    // Digits, open puncutation and Alphabetic characters.
+    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
+        UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
+    fDigitOrOpenPunctuationOrAlphabetSet.compact();
+    fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
+    fClosePunctuationSet.compact();
+
+    // handle Korean and Japanese/Chinese using different dictionaries
+    if (type == kKorean) {
+        if (U_SUCCESS(status)) {
             setCharacters(fHangulWordSet);
-        } else { //Chinese and Japanese
-            UnicodeSet cjSet;
-            cjSet.addAll(fHanWordSet);
-            cjSet.addAll(fKatakanaWordSet);
-            cjSet.addAll(fHiraganaWordSet);
-            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
-            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+        }
+    } else { //Chinese and Japanese
+        UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+        if (U_SUCCESS(status)) {
             setCharacters(cjSet);
+            initJapanesePhraseParameter(status);
         }
     }
     UTRACE_EXIT_STATUS(status);
@@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) {
             (value >= 0xFF66 && value <= 0xFF9f);
 }
 
-
 // Function for accessing internal utext flags.
 //   Replicates an internal UText function.
 
 static inline int32_t utext_i32_flag(int32_t bitIndex) {
     return (int32_t)1 << bitIndex;
 }
-
        
 /*
  * @param text A UText representing the text
@@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         int32_t rangeStart,
         int32_t rangeEnd,
         UVector32 &foundBreaks,
+        UBool isPhraseBreaking,
         UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     if (rangeStart >= rangeEnd) {
@@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
         t_boundary.addElement(numCodePts, status);
         numBreaks++;
+    } else if (isPhraseBreaking) {
+        t_boundary.addElement(numCodePts, status);
+        if(U_SUCCESS(status)) {
+            numBreaks++;
+            int32_t prevIdx = numCodePts;
+
+            int32_t codeUnitIdx = -1;
+            int32_t prevCodeUnitIdx = -1;
+            int32_t length = -1;
+            for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
+                codeUnitIdx = inString.moveIndex32(0, i);
+                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
+                // Calculate the length by using the code unit.
+                length = prevCodeUnitIdx - codeUnitIdx;
+                prevIdx = i;
+                // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+                // characters don't occur.
+                if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
+                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
+                           || !isKatakana(inString.char32At(codeUnitIdx)))) {
+                    t_boundary.addElement(i, status);
+                    numBreaks++;
+                }
+            }
+        }
     } else {
         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
             t_boundary.addElement(i, status);
@@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     // while reversing t_boundary and pushing values to foundBreaks.
     int32_t prevCPPos = -1;
     int32_t prevUTextPos = -1;
-    for (int32_t i = numBreaks-1; i >= 0; i--) {
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = numBreaks - 1; i >= 0; i--) {
         int32_t cpPos = t_boundary.elementAti(i);
         U_ASSERT(cpPos > prevCPPos);
         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
@@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         if (utextPos > prevUTextPos) {
             // Boundaries are added to foundBreaks output in ascending order.
             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
-            foundBreaks.push(utextPos, status);
+            // In phrase breaking, there has to be a breakpoint between Cj character and close
+            // punctuation.
+            // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
+            if (utextPos != rangeStart
+                || (isPhraseBreaking && utextPos > 0
+                       && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
         } else {
             // Normalization expanded the input text, the dictionary found a boundary
             // within the expansion, giving two boundaries with the same index in the
@@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     }
     (void)prevCPPos; // suppress compiler warnings about unused variable
 
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
+        // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
+        if (isPhraseBreaking) {
+            if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+                foundBreaks.popi();
+                correctedNumBreaks--;
+            }
+        } else {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
     // inString goes out of scope
     // inputMap goes out of scope
-    return numBreaks;
+    return correctedNumBreaks;
+}
+
+void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
+    loadJapaneseExtensions(error);
+    loadHiragana(error);
+}
+
+void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
+    const char* tag = "extensions";
+    ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
+    if (U_SUCCESS(error)) {
+        ResourceBundle bundle = ja.get(tag, error);
+        while (U_SUCCESS(error) && bundle.hasNext()) {
+            fSkipSet.puti(bundle.getNextString(error), 1, error);
+        }
+    }
+}
+
+void CjkBreakEngine::loadHiragana(UErrorCode& error) {
+    UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
+    hiraganaWordSet.compact();
+    UnicodeSetIterator iterator(hiraganaWordSet);
+    while (iterator.next()) {
+        fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
+    }
 }
 #endif
 
diff --git a/thirdparty/icu4c/common/dictbe.h b/thirdparty/icu4c/common/dictbe.h
index 4e70ed3817..ca1a3c28b7 100644
--- a/thirdparty/icu4c/common/dictbe.h
+++ b/thirdparty/icu4c/common/dictbe.h
@@ -15,6 +15,7 @@
 #include "unicode/utext.h"
 
 #include "brkeng.h"
+#include "hash.h"
 #include "uvectr32.h"
 
 U_NAMESPACE_BEGIN
@@ -80,6 +81,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
                               int32_t startPos,
                               int32_t endPos,
                               UVector32 &foundBreaks,
+                              UBool isPhraseBreaking,
                               UErrorCode& status ) const override;
 
  protected:
@@ -105,6 +107,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
                                            int32_t rangeStart,
                                            int32_t rangeEnd,
                                            UVector32 &foundBreaks,
+                                           UBool isPhraseBreaking,
                                            UErrorCode& status) const = 0;
 
 };
@@ -127,7 +130,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
      * @internal
      */
 
-  UnicodeSet                fThaiWordSet;
   UnicodeSet                fEndWordSet;
   UnicodeSet                fBeginWordSet;
   UnicodeSet                fSuffixSet;
@@ -164,6 +166,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
                                            int32_t rangeStart,
                                            int32_t rangeEnd,
                                            UVector32 &foundBreaks,
+                                           UBool isPhraseBreaking,
                                            UErrorCode& status) const override;
 
 };
@@ -186,7 +189,6 @@ class LaoBreakEngine : public DictionaryBreakEngine {
      * @internal
      */
 
-  UnicodeSet                fLaoWordSet;
   UnicodeSet                fEndWordSet;
   UnicodeSet                fBeginWordSet;
   UnicodeSet                fMarkSet;
@@ -222,6 +224,7 @@ class LaoBreakEngine : public DictionaryBreakEngine {
                                            int32_t rangeStart,
                                            int32_t rangeEnd,
                                            UVector32 &foundBreaks,
+                                           UBool isPhraseBreaking,
                                            UErrorCode& status) const override;
 
 };
@@ -244,7 +247,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
      * @internal
      */
 
-  UnicodeSet                fBurmeseWordSet;
   UnicodeSet                fEndWordSet;
   UnicodeSet                fBeginWordSet;
   UnicodeSet                fMarkSet;
@@ -280,6 +282,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
                                            int32_t rangeStart,
                                            int32_t rangeEnd,
                                            UVector32 &foundBreaks,
+                                           UBool isPhraseBreaking,
                                            UErrorCode& status) const override;
 
 };
@@ -302,7 +305,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
      * @internal
      */
 
-  UnicodeSet                fKhmerWordSet;
   UnicodeSet                fEndWordSet;
   UnicodeSet                fBeginWordSet;
   UnicodeSet                fMarkSet;
@@ -338,6 +340,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
                                            int32_t rangeStart,
                                            int32_t rangeEnd,
                                            UVector32 &foundBreaks,
+                                           UBool isPhraseBreaking,
                                            UErrorCode& status) const override;
 
 };
@@ -366,13 +369,22 @@ class CjkBreakEngine : public DictionaryBreakEngine {
      * @internal
      */
   UnicodeSet                fHangulWordSet;
-  UnicodeSet                fHanWordSet;
-  UnicodeSet                fKatakanaWordSet;
-  UnicodeSet                fHiraganaWordSet;
+  UnicodeSet                fDigitOrOpenPunctuationOrAlphabetSet;
+  UnicodeSet                fClosePunctuationSet;
 
   DictionaryMatcher        *fDictionary;
   const Normalizer2        *nfkcNorm2;
 
+ private:
+  // Load Japanese extensions.
+  void loadJapaneseExtensions(UErrorCode& error);
+  // Load Japanese Hiragana.
+  void loadHiragana(UErrorCode& error);
+  // Initialize fSkipSet by loading Japanese Hiragana and extensions.
+  void initJapanesePhraseParameter(UErrorCode& error);
+
+  Hashtable fSkipSet;
+
  public:
 
     /**
@@ -404,6 +416,7 @@ class CjkBreakEngine : public DictionaryBreakEngine {
           int32_t rangeStart,
           int32_t rangeEnd,
           UVector32 &foundBreaks,
+          UBool isPhraseBreaking,
           UErrorCode& status) const override;
 
 };
diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp
index 3d178dfbaf..2cad708d99 100644
--- a/thirdparty/icu4c/common/localematcher.cpp
+++ b/thirdparty/icu4c/common/localematcher.cpp
@@ -168,12 +168,9 @@ void LocaleMatcher::Builder::clearSupportedLocales() {
 bool LocaleMatcher::Builder::ensureSupportedLocaleVector() {
     if (U_FAILURE(errorCode_)) { return false; }
     if (supportedLocales_ != nullptr) { return true; }
-    supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_);
+    LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_);
     if (U_FAILURE(errorCode_)) { return false; }
-    if (supportedLocales_ == nullptr) {
-        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
-        return false;
-    }
+    supportedLocales_ = lpSupportedLocales.orphan();
     return true;
 }
 
@@ -187,9 +184,8 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
     for (int32_t i = 0; i < length; ++i) {
         Locale *locale = list.orphanLocaleAt(i);
         if (locale == nullptr) { continue; }
-        supportedLocales_->addElementX(locale, errorCode_);
+        supportedLocales_->adoptElement(locale, errorCode_);
         if (U_FAILURE(errorCode_)) {
-            delete locale;
             break;
         }
     }
@@ -197,35 +193,21 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
 }
 
 LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) {
-    if (U_FAILURE(errorCode_)) { return *this; }
-    clearSupportedLocales();
-    if (!ensureSupportedLocaleVector()) { return *this; }
-    while (locales.hasNext()) {
-        const Locale &locale = locales.next();
-        Locale *clone = locale.clone();
-        if (clone == nullptr) {
-            errorCode_ = U_MEMORY_ALLOCATION_ERROR;
-            break;
-        }
-        supportedLocales_->addElementX(clone, errorCode_);
-        if (U_FAILURE(errorCode_)) {
-            delete clone;
-            break;
+    if (ensureSupportedLocaleVector()) {
+        clearSupportedLocales();
+        while (locales.hasNext() && U_SUCCESS(errorCode_)) {
+            const Locale &locale = locales.next();
+            LocalPointer<Locale> clone (locale.clone(), errorCode_);
+            supportedLocales_->adoptElement(clone.orphan(), errorCode_);
         }
     }
     return *this;
 }
 
 LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) {
-    if (!ensureSupportedLocaleVector()) { return *this; }
-    Locale *clone = locale.clone();
-    if (clone == nullptr) {
-        errorCode_ = U_MEMORY_ALLOCATION_ERROR;
-        return *this;
-    }
-    supportedLocales_->addElementX(clone, errorCode_);
-    if (U_FAILURE(errorCode_)) {
-        delete clone;
+    if (ensureSupportedLocaleVector()) {
+        LocalPointer<Locale> clone(locale.clone(), errorCode_);
+        supportedLocales_->adoptElement(clone.orphan(), errorCode_);
     }
     return *this;
 }
diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp
index e8859c7048..73bb8d8aec 100644
--- a/thirdparty/icu4c/common/locid.cpp
+++ b/thirdparty/icu4c/common/locid.cpp
@@ -1204,14 +1204,11 @@ AliasReplacer::parseLanguageReplacement(
     // We have multiple field so we have to allocate and parse
     CharString* str = new CharString(
         replacement, (int32_t)uprv_strlen(replacement), status);
+    LocalPointer<CharString> lpStr(str, status);
+    toBeFreed.adoptElement(lpStr.orphan(), status);
     if (U_FAILURE(status)) {
         return;
     }
-    if (str == nullptr) {
-        status = U_MEMORY_ALLOCATION_ERROR;
-        return;
-    }
-    toBeFreed.addElementX(str, status);
     char* data = str->data();
     replacedLanguage = (const char*) data;
     char* endOfField = uprv_strchr(data, '_');
@@ -1420,12 +1417,9 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status)
                                (int32_t)(firstSpace - replacement), status), status);
         }
         if (U_FAILURE(status)) { return false; }
-        if (item.isNull()) {
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return false;
-        }
         replacedRegion = item->data();
-        toBeFreed.addElementX(item.orphan(), status);
+        toBeFreed.adoptElement(item.orphan(), status);
+        if (U_FAILURE(status)) { return false; }
     }
     U_ASSERT(!same(region, replacedRegion));
     region = replacedRegion;
@@ -1659,10 +1653,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
         while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
                U_SUCCESS(status)) {
             *end = NULL_CHAR;  // null terminate inside variantsBuff
-            variants.addElementX(start, status);
+            variants.addElement(start, status);
             start = end + 1;
         }
-        variants.addElementX(start, status);
+        variants.addElement(start, status);
     }
     if (U_FAILURE(status)) { return false; }
 
diff --git a/thirdparty/icu4c/common/lstmbe.cpp b/thirdparty/icu4c/common/lstmbe.cpp
index 3793abceb3..f6114cdfe2 100644
--- a/thirdparty/icu4c/common/lstmbe.cpp
+++ b/thirdparty/icu4c/common/lstmbe.cpp
@@ -1,8 +1,8 @@
 // © 2021 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 
+#include <complex>
 #include <utility>
-#include <ctgmath>
 
 #include "unicode/utypes.h"
 
@@ -639,6 +639,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t startPos,
                                                 int32_t endPos,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     int32_t beginFoundBreakSize = foundBreaks.size();
diff --git a/thirdparty/icu4c/common/lstmbe.h b/thirdparty/icu4c/common/lstmbe.h
index c3f7ecf815..ffdf805eca 100644
--- a/thirdparty/icu4c/common/lstmbe.h
+++ b/thirdparty/icu4c/common/lstmbe.h
@@ -62,6 +62,7 @@ protected:
                                              int32_t rangeStart,
                                              int32_t rangeEnd,
                                              UVector32 &foundBreaks,
+                                             UBool isPhraseBreaking,
                                              UErrorCode& status) const override;
 private:
     const LSTMData* fData;
diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp
index 5bfd49e8cb..e6bd75e717 100644
--- a/thirdparty/icu4c/common/normalizer2impl.cpp
+++ b/thirdparty/icu4c/common/normalizer2impl.cpp
@@ -2496,15 +2496,18 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
         // origin is not the first character, or it is U+0000.
         UnicodeSet *set;
         if((canonValue&CANON_HAS_SET)==0) {
-            set=new UnicodeSet;
-            if(set==NULL) {
-                errorCode=U_MEMORY_ALLOCATION_ERROR;
+            LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
+            set=lpSet.getAlias();
+            if(U_FAILURE(errorCode)) {
                 return;
             }
             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
             umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
-            canonStartSets.addElementX(set, errorCode);
+            canonStartSets.adoptElement(lpSet.orphan(), errorCode);
+            if (U_FAILURE(errorCode)) {
+                return;
+            }
             if(firstOrigin!=0) {
                 set->add(firstOrigin);
             }
diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp
index f65177f232..cae8d154b3 100644
--- a/thirdparty/icu4c/common/rbbi.cpp
+++ b/thirdparty/icu4c/common/rbbi.cpp
@@ -82,6 +82,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
     }
 }
 
+//-------------------------------------------------------------------------------
+//
+//   Constructor   from a UDataMemory handle to precompiled break rules
+//                 stored in an ICU data file. This construcotr is private API,
+//                 only for internal use.
+//
+//-------------------------------------------------------------------------------
+RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
+        UErrorCode &status) : RuleBasedBreakIterator(udm, status)
+{
+    fIsPhraseBreaking = isPhraseBreaking;
+}
+
 //
 //  Construct from precompiled binary rules (tables).  This constructor is public API,
 //  taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
@@ -322,6 +335,7 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
     fBreakCache           = nullptr;
     fDictionaryCache      = nullptr;
     fLookAheadMatches     = nullptr;
+    fIsPhraseBreaking     = false;
 
     // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
     // fText                 = UTEXT_INITIALIZER;
diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp
index 6bfe3feca4..26d82df781 100644
--- a/thirdparty/icu4c/common/rbbi_cache.cpp
+++ b/thirdparty/icu4c/common/rbbi_cache.cpp
@@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
         // Ask the language object if there are any breaks. It will add them to the cache and
         // leave the text pointer on the other side of its range, ready to search for the next one.
         if (lbe != NULL) {
-            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status);
+            foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
         }
 
         // Reload the loop variables for the next go-round
diff --git a/thirdparty/icu4c/common/serv.cpp b/thirdparty/icu4c/common/serv.cpp
index 0c54a4dce9..c26dbca1a9 100644
--- a/thirdparty/icu4c/common/serv.cpp
+++ b/thirdparty/icu4c/common/serv.cpp
@@ -625,10 +625,7 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC
                     }
                 }
 
-                LocalPointer<UnicodeString> idClone(new UnicodeString(*id), status);
-                if (U_SUCCESS(status) && idClone->isBogus()) {
-                    status = U_MEMORY_ALLOCATION_ERROR;
-                }
+                LocalPointer<UnicodeString> idClone(id->clone(), status);
                 result.adoptElement(idClone.orphan(), status);
             }
             delete fallbackKey;
diff --git a/thirdparty/icu4c/common/servls.cpp b/thirdparty/icu4c/common/servls.cpp
index 7108afd4a5..98f0a8a12b 100644
--- a/thirdparty/icu4c/common/servls.cpp
+++ b/thirdparty/icu4c/common/servls.cpp
@@ -179,7 +179,8 @@ private:
 
             length = other._ids.size();
             for(i = 0; i < length; ++i) {
-                _ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
+                LocalPointer<UnicodeString> clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
+                _ids.adoptElement(clonedId.orphan(), status);
             }
 
             if(U_SUCCESS(status)) {
diff --git a/thirdparty/icu4c/common/servnotf.cpp b/thirdparty/icu4c/common/servnotf.cpp
index 342e0d9f24..d9fb388752 100644
--- a/thirdparty/icu4c/common/servnotf.cpp
+++ b/thirdparty/icu4c/common/servnotf.cpp
@@ -49,7 +49,11 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
         if (acceptsListener(*l)) {
             Mutex lmx(&notifyLock);
             if (listeners == NULL) {
-                listeners = new UVector(5, status);
+                LocalPointer<UVector> lpListeners(new UVector(5, status), status);
+                if (U_FAILURE(status)) {
+                    return;
+                }
+                listeners = lpListeners.orphan();
             } else {
                 for (int i = 0, e = listeners->size(); i < e; ++i) {
                     const EventListener* el = (const EventListener*)(listeners->elementAt(i));
@@ -59,7 +63,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
                 }
             }
 
-            listeners->addElementX((void*)l, status); // cast away const
+            listeners->addElement((void*)l, status); // cast away const
         }
 #ifdef NOTIFIER_DEBUG
         else {
@@ -102,13 +106,11 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status)
 void 
 ICUNotifier::notifyChanged(void) 
 {
+    Mutex lmx(&notifyLock);
     if (listeners != NULL) {
-        Mutex lmx(&notifyLock);
-        if (listeners != NULL) {
-            for (int i = 0, e = listeners->size(); i < e; ++i) {
-                EventListener* el = (EventListener*)listeners->elementAt(i);
-                notifyListener(*el);
-            }
+        for (int i = 0, e = listeners->size(); i < e; ++i) {
+            EventListener* el = (EventListener*)listeners->elementAt(i);
+            notifyListener(*el);
         }
     }
 }
diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp
index bb5bdd1b50..f4e064961f 100644
--- a/thirdparty/icu4c/common/ubrk.cpp
+++ b/thirdparty/icu4c/common/ubrk.cpp
@@ -168,7 +168,7 @@ ubrk_safeClone(
     BreakIterator *newBI = ((BreakIterator *)bi)->clone();
     if (newBI == NULL) {
         *status = U_MEMORY_ALLOCATION_ERROR;
-    } else {
+    } else if (pBufferSize != NULL) {
         *status = U_SAFECLONE_ALLOCATED_WARNING;
     }
     return (UBreakIterator *)newBI;
@@ -176,15 +176,7 @@ ubrk_safeClone(
 
 U_CAPI UBreakIterator * U_EXPORT2
 ubrk_clone(const UBreakIterator *bi, UErrorCode *status) {
-    if (U_FAILURE(*status)) {
-        return nullptr;
-    }
-    BreakIterator *newBI = ((BreakIterator *)bi)->clone();
-    if (newBI == nullptr) {
-        *status = U_MEMORY_ALLOCATION_ERROR;
-        return nullptr;
-    }
-    return (UBreakIterator *)newBI;
+    return ubrk_safeClone(bi, nullptr, nullptr, status);
 }
 
 
diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp
index 4aa856507a..388c86b1bb 100644
--- a/thirdparty/icu4c/common/ucase.cpp
+++ b/thirdparty/icu4c/common/ucase.cpp
@@ -22,27 +22,14 @@
 #include "unicode/utypes.h"
 #include "unicode/unistr.h"
 #include "unicode/uset.h"
-#include "unicode/udata.h" /* UDataInfo */
 #include "unicode/utf16.h"
-#include "ucmndata.h" /* DataHeader */
-#include "udatamem.h"
-#include "umutex.h"
-#include "uassert.h"
 #include "cmemory.h"
-#include "utrie2.h"
+#include "uassert.h"
 #include "ucase.h"
+#include "umutex.h"
+#include "utrie2.h"
 
-struct UCaseProps {
-    UDataMemory *mem;
-    const int32_t *indexes;
-    const uint16_t *exceptions;
-    const uint16_t *unfold;
-
-    UTrie2 trie;
-    uint8_t formatVersion[4];
-};
-
-/* ucase_props_data.h is machine-generated by gencase --csource */
+/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
 #define INCLUDED_FROM_UCASE_CPP
 #include "ucase_props_data.h"
 
@@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
 
 /* data access primitives --------------------------------------------------- */
 
+U_CAPI const struct UCaseProps * U_EXPORT2
+ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
+    *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
+    *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
+    return &ucase_props_singleton;
+}
+
 U_CFUNC const UTrie2 * U_EXPORT2
 ucase_getTrie() {
     return &ucase_props_singleton.trie;
@@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) {
  *   - The general category of C is
  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
- *   - C is one of the following characters 
+ *   - C is one of the following characters
  *     U+0027 APOSTROPHE
  *     U+00AD SOFT HYPHEN (SHY)
  *     U+2019 RIGHT SINGLE QUOTATION MARK
@@ -1064,6 +1058,8 @@ ucase_toFullLower(UChar32 c,
     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
     U_ASSERT(c >= 0);
     UChar32 result=c;
+    // Reset the output pointer in case it was uninitialized.
+    *pString=nullptr;
     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     if(!UCASE_HAS_EXCEPTION(props)) {
         if(UCASE_IS_UPPER_OR_TITLE(props)) {
@@ -1148,7 +1144,6 @@ ucase_toFullLower(UChar32 c,
                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                  */
-                *pString=nullptr;
                 return 0; /* remove the dot (continue without output) */
             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
                 /*
@@ -1215,6 +1210,8 @@ toUpperOrTitle(UChar32 c,
     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
     U_ASSERT(c >= 0);
     UChar32 result=c;
+    // Reset the output pointer in case it was uninitialized.
+    *pString=nullptr;
     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     if(!UCASE_HAS_EXCEPTION(props)) {
         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
@@ -1252,7 +1249,6 @@ toUpperOrTitle(UChar32 c,
 
                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
                  */
-                *pString=nullptr;
                 return 0; /* remove the dot (continue without output) */
             } else if(c==0x0587) {
                 // See ICU-13416:
@@ -1449,6 +1445,8 @@ ucase_toFullFolding(UChar32 c,
     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
     U_ASSERT(c >= 0);
     UChar32 result=c;
+    // Reset the output pointer in case it was uninitialized.
+    *pString=nullptr;
     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     if(!UCASE_HAS_EXCEPTION(props)) {
         if(UCASE_IS_UPPER_OR_TITLE(props)) {
@@ -1542,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2
 u_tolower(UChar32 c) {
     return ucase_tolower(c);
 }
-    
+
 /* Transforms the Unicode character to its upper case equivalent.*/
 U_CAPI UChar32 U_EXPORT2
 u_toupper(UChar32 c) {
diff --git a/thirdparty/icu4c/common/ucase.h b/thirdparty/icu4c/common/ucase.h
index a018f82b81..7bf57fd370 100644
--- a/thirdparty/icu4c/common/ucase.h
+++ b/thirdparty/icu4c/common/ucase.h
@@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c,
 
 U_CDECL_END
 
+/* for icuexportdata -------------------------------------------------------- */
+
+struct UCaseProps {
+    void *mem;  // TODO: was unused, and type UDataMemory -- remove
+    const int32_t *indexes;
+    const uint16_t *exceptions;
+    const uint16_t *unfold;
+
+    UTrie2 trie;
+    uint8_t formatVersion[4];
+};
+
+U_CAPI const struct UCaseProps * U_EXPORT2
+ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength);
+
 /* file definitions --------------------------------------------------------- */
 
 #define UCASE_DATA_NAME "ucase"
diff --git a/thirdparty/icu4c/common/ucasemap.cpp b/thirdparty/icu4c/common/ucasemap.cpp
index ed72bda828..95b55d56a0 100644
--- a/thirdparty/icu4c/common/ucasemap.cpp
+++ b/thirdparty/icu4c/common/ucasemap.cpp
@@ -112,8 +112,7 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
     if(length==sizeof(csm->locale)) {
         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     }
-    if(U_SUCCESS(*pErrorCode)) {
-        csm->caseLocale=UCASE_LOC_UNKNOWN;
+    if(U_SUCCESS(*pErrorCode)) {     
         csm->caseLocale = ucase_getCaseLocale(csm->locale);
     } else {
         csm->locale[0]=0;
@@ -420,6 +419,97 @@ void toUpper(int32_t caseLocale, uint32_t options,
 
 #if !UCONFIG_NO_BREAK_ITERATION
 
+namespace {
+
+constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
+
+constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
+                          ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
+    U_ASSERT(start < segmentLimit);
+
+    int32_t index = start;
+    bool withAcute = false;
+
+    // If the conditions are met, then the following variables tell us what to output.
+    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
+    bool doTitleJ = false;  // true if the j needs to be titlecased
+    int32_t unchanged2 = 0;  // after the j (0 or 1)
+
+    // next character after the first letter
+    UChar32 c2;
+    c2 = src[index++];
+
+    // Is the first letter an i/I with accent?
+    if (c == u'I') {
+        if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
+            withAcute = true;
+            unchanged1 = 2;  // ACUTE is 2 code units in UTF-8
+            if (index == segmentLimit) { return start; }
+            c2 = src[index++];
+        }
+    } else {  // Í
+        withAcute = true;
+    }
+
+    // Is the next character a j/J?
+    if (c2 == u'j') {
+        doTitleJ = true;
+    } else if (c2 == u'J') {
+        ++unchanged1;
+    } else {
+        return start;
+    }
+
+    // A plain i/I must be followed by a plain j/J.
+    // An i/I with acute must be followed by a j/J with acute.
+    if (withAcute) {
+        if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
+            return start;
+        }
+        if (doTitleJ) {
+            unchanged2 = 2;  // ACUTE is 2 code units in UTF-8
+        } else {
+            unchanged1 = unchanged1 + 2;    // ACUTE is 2 code units in UTF-8
+        }
+    }
+
+    // There must not be another combining mark.
+    if (index < segmentLimit) {
+        int32_t cp;
+        int32_t i = index;
+        U8_NEXT(src, i, segmentLimit, cp);
+        uint32_t typeMask = U_GET_GC_MASK(cp);
+        if ((typeMask & U_GC_M_MASK) != 0) {
+            return start;
+        }
+    }
+
+    // Output the rest of the Dutch IJ.
+    ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
+    start += unchanged1;
+    if (doTitleJ) {
+        ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
+        ++start;
+    }
+    ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
+
+    U_ASSERT(start + unchanged2 == index);
+    return index;
+}
+
+}  // namespace
+
 U_CFUNC void U_CALLCONV
 ucasemap_internalUTF8ToTitle(
         int32_t caseLocale, uint32_t options, BreakIterator *iter,
@@ -504,19 +594,14 @@ ucasemap_internalUTF8ToTitle(
                 }
 
                 /* Special case Dutch IJ titlecasing */
-                if (titleStart+1 < index &&
-                        caseLocale == UCASE_LOC_DUTCH &&
-                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
-                    if (src[titleStart+1] == 0x006A) {
-                        ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
-                        titleLimit++;
-                    } else if (src[titleStart+1] == 0x004A) {
-                        // Keep the capital J from getting lowercased.
-                        if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
-                                                           sink, options, edits, errorCode)) {
-                            return;
-                        }
-                        titleLimit++;
+                if (titleLimit < index &&
+                    caseLocale == UCASE_LOC_DUTCH) {
+                    if (c < 0) {
+                        c = ~c;
+                    }
+
+                    if (c == u'I' || c == u'Í') {
+                        titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
                     }
                 }
 
diff --git a/thirdparty/icu4c/common/ucnv.cpp b/thirdparty/icu4c/common/ucnv.cpp
index 5dcf35e043..019bcb6a79 100644
--- a/thirdparty/icu4c/common/ucnv.cpp
+++ b/thirdparty/icu4c/common/ucnv.cpp
@@ -252,7 +252,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
             UTRACE_EXIT_STATUS(*status);
             return NULL;
         }
-        *status = U_SAFECLONE_ALLOCATED_WARNING;
+        // If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function.
+        if (pBufferSize != &stackBufferSize) {
+            *status = U_SAFECLONE_ALLOCATED_WARNING;
+        }
 
         /* record the fact that memory was allocated */
         *pBufferSize = bufferSizeNeeded;
@@ -317,7 +320,11 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
     return localConverter;
 }
 
-
+U_CAPI UConverter* U_EXPORT2
+ucnv_clone(const UConverter* cnv, UErrorCode *status)
+{
+    return ucnv_safeClone(cnv, nullptr, nullptr, status);
+}
 
 /*Decreases the reference counter in the shared immutable section of the object
  *and frees the mutable part*/
diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp
index 67aab4e8ff..6e489e0563 100644
--- a/thirdparty/icu4c/common/ucurr.cpp
+++ b/thirdparty/icu4c/common/ucurr.cpp
@@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void)
 }
 
 /**
- * Deleter for OlsonToMetaMappingEntry
+ * Deleter for IsoCodeEntry
  */
 static void U_CALLCONV
 deleteIsoCodeEntry(void *obj) {
diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp
index c8a3f1ff73..99c6a0af39 100644
--- a/thirdparty/icu4c/common/uloc.cpp
+++ b/thirdparty/icu4c/common/uloc.cpp
@@ -186,10 +186,10 @@ NULL
 };
 
 static const char* const DEPRECATED_LANGUAGES[]={
-    "in", "iw", "ji", "jw", NULL, NULL
+    "in", "iw", "ji", "jw", "mo", NULL, NULL
 };
 static const char* const REPLACEMENT_LANGUAGES[]={
-    "id", "he", "yi", "jv", NULL, NULL
+    "id", "he", "yi", "jv", "ro", NULL, NULL
 };
 
 /**
@@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = {
 /*  "VA",  "VC",  "VE",  "VG",  "VI",  "VN",  "VU",  "WF",     */
     "VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
 /*  "WS",  "XK",  "YE",  "YT",  "ZA",  "ZM",  "ZW",          */
-    "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
+    "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
 NULL,
 /*  "AN",  "BU",  "CS",  "FX",  "RO", "SU",  "TP",  "YD",  "YU",  "ZR" */
     "ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h
index 252bb7fdc2..0f7e04a3af 100644
--- a/thirdparty/icu4c/common/unicode/localematcher.h
+++ b/thirdparty/icu4c/common/unicode/localematcher.h
@@ -461,13 +461,13 @@ public:
          * Option for whether to include or ignore one-way (fallback) match data.
          * By default, they are included.
          *
-         * @param direction the match direction to set.
+         * @param matchDirection the match direction to set.
          * @return this Builder object
          * @stable ICU 67
          */
-        Builder &setDirection(ULocMatchDirection direction) {
+        Builder &setDirection(ULocMatchDirection matchDirection) {
             if (U_SUCCESS(errorCode_)) {
-                direction_ = direction;
+                direction_ = matchDirection;
             }
             return *this;
         }
diff --git a/thirdparty/icu4c/common/unicode/rbbi.h b/thirdparty/icu4c/common/unicode/rbbi.h
index 0ce93819f5..0bad0d3897 100644
--- a/thirdparty/icu4c/common/unicode/rbbi.h
+++ b/thirdparty/icu4c/common/unicode/rbbi.h
@@ -147,6 +147,11 @@ private:
      */
     int32_t *fLookAheadMatches;
 
+    /**
+     *  A flag to indicate if phrase based breaking is enabled.
+     */
+    UBool fIsPhraseBreaking;
+
     //=======================================================================
     // constructors
     //=======================================================================
@@ -163,6 +168,21 @@ private:
      */
     RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
 
+    /**
+     * This constructor uses the udata interface to create a BreakIterator
+     * whose internal tables live in a memory-mapped file.  "image" is an
+     * ICU UDataMemory handle for the pre-compiled break iterator tables.
+     * @param image handle to the memory image for the break iterator data.
+     *        Ownership of the UDataMemory handle passes to the Break Iterator,
+     *        which will be responsible for closing it when it is no longer needed.
+     * @param status Information on any errors encountered.
+     * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
+     * @see udata_open
+     * @see #getBinaryRules
+     * @internal (private)
+     */
+    RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
+
     /** @internal */
     friend class RBBIRuleBuilder;
     /** @internal */
diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h
index c603f7c13f..2b3dc7aa57 100644
--- a/thirdparty/icu4c/common/unicode/ubrk.h
+++ b/thirdparty/icu4c/common/unicode/ubrk.h
@@ -312,11 +312,12 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
  *  If *pBufferSize is not enough for a stack-based safe clone,
  *  new memory will be allocated.
  * @param status to indicate whether the operation went on smoothly or there were errors
- *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
+ *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
+ * if pBufferSize != NULL and any allocations were necessary
  * @return pointer to the new clone
  * @deprecated ICU 69 Use ubrk_clone() instead.
  */
-U_CAPI UBreakIterator * U_EXPORT2
+U_DEPRECATED UBreakIterator * U_EXPORT2
 ubrk_safeClone(
           const UBreakIterator *bi,
           void *stackBuffer,
@@ -325,21 +326,17 @@ ubrk_safeClone(
 
 #endif /* U_HIDE_DEPRECATED_API */
 
-#ifndef U_HIDE_DRAFT_API
-
 /**
  * Thread safe cloning operation.
  * @param bi iterator to be cloned
  * @param status to indicate whether the operation went on smoothly or there were errors
  * @return pointer to the new clone
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI UBreakIterator * U_EXPORT2
 ubrk_clone(const UBreakIterator *bi,
            UErrorCode *status);
 
-#endif  // U_HIDE_DRAFT_API
-
 #ifndef U_HIDE_DEPRECATED_API
 
 /**
diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h
index 2687c984d4..20c173b662 100644
--- a/thirdparty/icu4c/common/unicode/ucnv.h
+++ b/thirdparty/icu4c/common/unicode/ucnv.h
@@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage,
  *
  * <p>The name will NOT be looked up in the alias mechanism, nor will the converter be
  * stored in the converter cache or the alias table. The only way to open further converters
- * is call this function multiple times, or use the ucnv_safeClone() function to clone a
+ * is call this function multiple times, or use the ucnv_clone() function to clone a
  * 'primary' converter.</p>
  *
  * <p>A future version of ICU may add alias table lookups and/or caching
@@ -493,7 +493,7 @@ ucnv_openCCSID(int32_t codepage,
  * @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred
  * @see udata_open
  * @see ucnv_open
- * @see ucnv_safeClone
+ * @see ucnv_clone
  * @see ucnv_close
  * @stable ICU 2.2
  */
@@ -502,6 +502,20 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode
 
 /**
  * Thread safe converter cloning operation.
+ *
+ * You must ucnv_close() the clone.
+ *
+ * @param cnv converter to be cloned
+ * @param status to indicate whether the operation went on smoothly or there were errors
+ * @return pointer to the new clone
+ * @stable ICU 71
+ */
+U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status);
+
+#ifndef U_HIDE_DEPRECATED_API
+
+/**
+ * Thread safe converter cloning operation.
  * For most efficient operation, pass in a stackBuffer (and a *pBufferSize)
  * with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space.
  * If the buffer size is sufficient, then the clone will use the stack buffer;
@@ -532,21 +546,19 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode
  *  pointer to size of allocated space.
  * @param status to indicate whether the operation went on smoothly or there were errors
  *  An informational status value, U_SAFECLONE_ALLOCATED_WARNING,
- *  is used if any allocations were necessary.
+ *  is used if pBufferSize != NULL and any allocations were necessary
  *  However, it is better to check if *pBufferSize grew for checking for
  *  allocations because warning codes can be overridden by subsequent
  *  function calls.
  * @return pointer to the new clone
- * @stable ICU 2.0
+ * @deprecated ICU 71 Use ucnv_clone() instead.
  */
-U_CAPI UConverter * U_EXPORT2
+U_DEPRECATED UConverter * U_EXPORT2
 ucnv_safeClone(const UConverter *cnv,
                void             *stackBuffer,
                int32_t          *pBufferSize,
                UErrorCode       *status);
 
-#ifndef U_HIDE_DEPRECATED_API
-
 /**
  * \def U_CNV_SAFECLONE_BUFFERSIZE
  * Definition of a buffer size that is designed to be large enough for
diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h
index 730337a353..310c7c8d20 100644
--- a/thirdparty/icu4c/common/unicode/uniset.h
+++ b/thirdparty/icu4c/common/unicode/uniset.h
@@ -1229,7 +1229,6 @@ public:
      */
     UnicodeSet& retain(UChar32 c);
 
-#ifndef U_HIDE_DRAFT_API
     /**
      * Retains only the specified string from this set if it is present.
      * Upon return this set will be empty if it did not contain s, or
@@ -1238,10 +1237,9 @@ public:
      *
      * @param s the source string
      * @return this object, for chaining
-     * @draft ICU 69
+     * @stable ICU 69
      */
     UnicodeSet& retain(const UnicodeString &s);
-#endif  // U_HIDE_DRAFT_API
 
     /**
      * Removes the specified range from this set if it is present.
diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h
index 4605f632ea..d9f9b8f336 100644
--- a/thirdparty/icu4c/common/unicode/urename.h
+++ b/thirdparty/icu4c/common/unicode/urename.h
@@ -567,6 +567,7 @@
 #define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure)
 #define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold)
 #define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale)
+#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton)
 #define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie)
 #define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType)
 #define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable)
@@ -630,6 +631,7 @@
 #define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars)
 #define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub)
 #define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars)
+#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone)
 #define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close)
 #define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames)
 #define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert)
@@ -725,6 +727,7 @@
 #define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString)
 #define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8)
 #define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize)
+#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone)
 #define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary)
 #define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close)
 #define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements)
@@ -904,6 +907,7 @@
 #define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern)
 #define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions)
 #define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat)
+#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle)
 #define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal)
 #define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle)
 #define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName)
@@ -918,6 +922,7 @@
 #define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat)
 #define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName)
 #define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat)
+#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle)
 #define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal)
 #define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap)
 #define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close)
diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h
index 2ef352ef56..33332f2d36 100644
--- a/thirdparty/icu4c/common/unicode/uset.h
+++ b/thirdparty/icu4c/common/unicode/uset.h
@@ -628,7 +628,6 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
 U_CAPI void U_EXPORT2
 uset_removeString(USet* set, const UChar* str, int32_t strLen);
 
-#ifndef U_HIDE_DRAFT_API
 /**
  * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
  * A frozen set will not be modified.
@@ -636,11 +635,10 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen);
  * @param set the object to be modified
  * @param str the string
  * @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif  // U_HIDE_DRAFT_API
 
 /**
  * Removes from this set all of its elements that are contained in the
@@ -671,7 +669,6 @@ uset_removeAll(USet* set, const USet* removeSet);
 U_CAPI void U_EXPORT2
 uset_retain(USet* set, UChar32 start, UChar32 end);
 
-#ifndef U_HIDE_DRAFT_API
 /**
  * Retains only the specified string from this set if it is present.
  * Upon return this set will be empty if it did not contain s, or
@@ -681,7 +678,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end);
  * @param set the object to be modified
  * @param str the string
  * @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_retainString(USet *set, const UChar *str, int32_t length);
@@ -693,11 +690,10 @@ uset_retainString(USet *set, const UChar *str, int32_t length);
  * @param set the object to be modified
  * @param str the string
  * @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif  // U_HIDE_DRAFT_API
 
 /**
  * Retains only the elements in this set that are contained in the
@@ -741,7 +737,6 @@ uset_compact(USet* set);
 U_CAPI void U_EXPORT2
 uset_complement(USet* set);
 
-#ifndef U_HIDE_DRAFT_API
 /**
  * Complements the specified range in this set.  Any character in
  * the range will be removed if it is in this set, or will be
@@ -753,7 +748,7 @@ uset_complement(USet* set);
  * @param set the object to be modified
  * @param start first character, inclusive, of range
  * @param end last character, inclusive, of range
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_complementRange(USet *set, UChar32 start, UChar32 end);
@@ -766,7 +761,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end);
  * @param set the object to be modified
  * @param str the string
  * @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_complementString(USet *set, const UChar *str, int32_t length);
@@ -778,11 +773,10 @@ uset_complementString(USet *set, const UChar *str, int32_t length);
  * @param set the object to be modified
  * @param str the string
  * @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
  */
 U_CAPI void U_EXPORT2
 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif  // U_HIDE_DRAFT_API
 
 /**
  * Complements in this set all elements contained in the specified
diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h
index 42e8865d7e..2706e0b060 100644
--- a/thirdparty/icu4c/common/unicode/uvernum.h
+++ b/thirdparty/icu4c/common/unicode/uvernum.h
@@ -60,7 +60,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION_MAJOR_NUM 70
+#define U_ICU_VERSION_MAJOR_NUM 71
 
 /** The current ICU minor version as an integer.
  *  This value will change in the subsequent releases of ICU
@@ -86,7 +86,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.6
  */
-#define U_ICU_VERSION_SUFFIX _70
+#define U_ICU_VERSION_SUFFIX _71
 
 /**
  * \def U_DEF2_ICU_ENTRY_POINT_RENAME
@@ -139,7 +139,7 @@
  *  This value will change in the subsequent releases of ICU
  *  @stable ICU 2.4
  */
-#define U_ICU_VERSION "70.1"
+#define U_ICU_VERSION "71.1"
 
 /**
  * The current ICU library major version number as a string, for library name suffixes.
@@ -152,13 +152,13 @@
  *
  * @stable ICU 2.6
  */
-#define U_ICU_VERSION_SHORT "70"
+#define U_ICU_VERSION_SHORT "71"
 
 #ifndef U_HIDE_INTERNAL_API
 /** Data version in ICU4C.
  * @internal ICU 4.4 Internal Use Only
  **/
-#define U_ICU_DATA_VERSION "70.1"
+#define U_ICU_DATA_VERSION "71.1"
 #endif  /* U_HIDE_INTERNAL_API */
 
 /*===========================================================================
diff --git a/thirdparty/icu4c/common/unistr.cpp b/thirdparty/icu4c/common/unistr.cpp
index 077b4d6ef2..c18665928d 100644
--- a/thirdparty/icu4c/common/unistr.cpp
+++ b/thirdparty/icu4c/common/unistr.cpp
@@ -334,7 +334,8 @@ Replaceable::clone() const {
 // UnicodeString overrides clone() with a real implementation
 UnicodeString *
 UnicodeString::clone() const {
-  return new UnicodeString(*this);
+  LocalPointer<UnicodeString> clonedString(new UnicodeString(*this));
+  return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr;
 }
 
 //========================================
@@ -1976,7 +1977,12 @@ The vector deleting destructor is already a part of UObject,
 but defining it here makes sure that it is included with this object file.
 This makes sure that static library dependencies are kept to a minimum.
 */
+#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
 static void uprv_UnicodeStringDummy(void) {
     delete [] (new UnicodeString[2]);
 }
+#pragma GCC diagnostic pop
+#endif
 #endif
diff --git a/thirdparty/icu4c/common/ustrcase.cpp b/thirdparty/icu4c/common/ustrcase.cpp
index 36b19e75f2..43910ea520 100644
--- a/thirdparty/icu4c/common/ustrcase.cpp
+++ b/thirdparty/icu4c/common/ustrcase.cpp
@@ -36,6 +36,12 @@
 #include "ustr_imp.h"
 #include "uassert.h"
 
+/**
+ * Code point for COMBINING ACUTE ACCENT
+ * @internal
+ */
+#define ACUTE u'\u0301'
+
 U_NAMESPACE_BEGIN
 
 namespace {
@@ -396,6 +402,94 @@ U_NAMESPACE_USE
 
 #if !UCONFIG_NO_BREAK_ITERATION
 
+namespace {
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
+                          UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
+                          icu::Edits *edits) {
+    U_ASSERT(start < segmentLimit);
+
+    int32_t index = start;
+    bool withAcute = false;
+
+    // If the conditions are met, then the following variables tell us what to output.
+    int32_t unchanged1 = 0;  // code units before the j, or the whole sequence (0..3)
+    bool doTitleJ = false;  // true if the j needs to be titlecased
+    int32_t unchanged2 = 0;  // after the j (0 or 1)
+
+    // next character after the first letter
+    UChar c2 = src[index++];
+
+    // Is the first letter an i/I with accent?
+    if (c == u'I') {
+        if (c2 == ACUTE) {
+            withAcute = true;
+            unchanged1 = 1;
+            if (index == segmentLimit) { return start; }
+            c2 = src[index++];
+        }
+    } else {  // Í
+        withAcute = true;
+    }
+
+    // Is the next character a j/J?
+    if (c2 == u'j') {
+        doTitleJ = true;
+    } else if (c2 == u'J') {
+        ++unchanged1;
+    } else {
+        return start;
+    }
+
+    // A plain i/I must be followed by a plain j/J.
+    // An i/I with acute must be followed by a j/J with acute.
+    if (withAcute) {
+        if (index == segmentLimit || src[index++] != ACUTE) { return start; }
+        if (doTitleJ) {
+            unchanged2 = 1;
+        } else {
+            ++unchanged1;
+        }
+    }
+
+    // There must not be another combining mark.
+    if (index < segmentLimit) {
+        int32_t cp;
+        int32_t i = index;
+        U16_NEXT(src, i, segmentLimit, cp);
+        uint32_t typeMask = U_GET_GC_MASK(cp);
+        if ((typeMask & U_GC_M_MASK) != 0) {
+            return start;
+        }
+    }
+
+    // Output the rest of the Dutch IJ.
+    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
+    start += unchanged1;
+    if (doTitleJ) {
+        destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
+        if (edits != nullptr) {
+            edits->addReplace(1, 1);
+        }
+        ++start;
+    }
+    destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
+
+    U_ASSERT(start + unchanged2 == index);
+    return index;
+}
+
+}  // namespace
+
 U_CFUNC int32_t U_CALLCONV
 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
                          UChar *dest, int32_t destCapacity,
@@ -412,14 +506,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
     csc.limit=srcLength;
     int32_t destIndex=0;
     int32_t prev=0;
-    UBool isFirstIndex=TRUE;
+    bool isFirstIndex=true;
 
     /* titlecasing loop */
     while(prev<srcLength) {
         /* find next index where to titlecase */
         int32_t index;
         if(isFirstIndex) {
-            isFirstIndex=FALSE;
+            isFirstIndex=false;
             index=iter->first();
         } else {
             index=iter->next();
@@ -446,7 +540,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
                 // Stop with titleStart<titleLimit<=index
                 // if there is a character to be titlecased,
                 // or else stop with titleStart==titleLimit==index.
-                UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+                bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
                 while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
                     titleStart=titleLimit;
                     if(titleLimit==index) {
@@ -479,27 +573,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
 
                 /* Special case Dutch IJ titlecasing */
                 if (titleStart+1 < index &&
-                        caseLocale == UCASE_LOC_DUTCH &&
-                        (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
-                    if (src[titleStart+1] == 0x006A) {
-                        destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
-                        if(destIndex<0) {
-                            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                            return 0;
-                        }
-                        if(edits!=NULL) {
-                            edits->addReplace(1, 1);
-                        }
-                        titleLimit++;
-                    } else if (src[titleStart+1] == 0x004A) {
-                        // Keep the capital J from getting lowercased.
-                        destIndex=appendUnchanged(dest, destIndex, destCapacity,
-                                                  src+titleStart+1, 1, options, edits);
-                        if(destIndex<0) {
-                            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-                            return 0;
-                        }
-                        titleLimit++;
+                        caseLocale == UCASE_LOC_DUTCH) {
+                    if (c < 0) {
+                        c = ~c;
+                    }
+
+                    if (c == u'I' || c == u'Í') {
+                        titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index, 
+                                                       dest, destIndex, destCapacity, options, 
+                                                       edits);
                     }
                 }
 
diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp
index 4da8b864e1..844463921e 100644
--- a/thirdparty/icu4c/common/uvector.cpp
+++ b/thirdparty/icu4c/common/uvector.cpp
@@ -99,14 +99,6 @@ bool UVector::operator==(const UVector& other) const {
     return true;
 }
 
-// TODO: delete this function once all call sites have been migrated to the
-//       new addElement().
-void UVector::addElementX(void* obj, UErrorCode &status) {
-    if (ensureCapacityX(count + 1, status)) {
-        elements[count++].pointer = obj;
-    }
-}
-
 void UVector::addElement(void* obj, UErrorCode &status) {
     U_ASSERT(deleter == nullptr);
     if (ensureCapacity(count + 1, status)) {
@@ -331,38 +323,6 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const {
     return -1;
 }
 
-UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) {
-    if (minimumCapacity < 0) {
-        status = U_ILLEGAL_ARGUMENT_ERROR;
-        return FALSE;
-	}
-    if (capacity < minimumCapacity) {
-        if (capacity > (INT32_MAX - 1) / 2) {        	// integer overflow check
-        	status = U_ILLEGAL_ARGUMENT_ERROR;
-        	return FALSE;
-        }
-        int32_t newCap = capacity * 2;
-        if (newCap < minimumCapacity) {
-            newCap = minimumCapacity;
-        }
-        if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) {	// integer overflow check
-        	// We keep the original memory contents on bad minimumCapacity.
-        	status = U_ILLEGAL_ARGUMENT_ERROR;
-        	return FALSE;
-        }
-        UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap);
-        if (newElems == nullptr) {
-            // We keep the original contents on the memory failure on realloc or bad minimumCapacity.
-            status = U_MEMORY_ALLOCATION_ERROR;
-            return FALSE;
-        }
-        elements = newElems;
-        capacity = newCap;
-    }
-    return TRUE;
-}
-
-
 UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
     if (U_FAILURE(status)) {
         return false;
@@ -370,7 +330,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
     if (minimumCapacity < 0) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return false;
-	}
+    }
     if (capacity < minimumCapacity) {
         if (capacity > (INT32_MAX - 1) / 2) {        	// integer overflow check
             status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -396,6 +356,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
     }
     return true;
 }
+
 /**
  * Change the size of this vector as follows: If newSize is smaller,
  * then truncate the array, possibly deleting held elements for i >=
diff --git a/thirdparty/icu4c/common/uvector.h b/thirdparty/icu4c/common/uvector.h
index f61fcc2be6..1eb7d136e7 100644
--- a/thirdparty/icu4c/common/uvector.h
+++ b/thirdparty/icu4c/common/uvector.h
@@ -123,12 +123,6 @@ public:
     // java.util.Vector API
     //------------------------------------------------------------
 
-    /*
-     * Old version of addElement, with non-standard error handling.
-     * Will be removed once all uses have been switched to the new addElement().
-     */
-    void addElementX(void* obj, UErrorCode &status);
-
     /**
      * Add an element at the end of the vector.
      * For use only with vectors that do not adopt their elements, which is to say,
@@ -197,12 +191,6 @@ public:
 
     inline UBool isEmpty(void) const {return count == 0;}
 
-    /*
-     * Old version of ensureCapacity, with non-standard error handling.
-     * Will be removed once all uses have been switched to the new ensureCapacity().
-     */
-    UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status);
-
     UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status);
 
     /**
diff --git a/thirdparty/icu4c/common/uvectr32.cpp b/thirdparty/icu4c/common/uvectr32.cpp
index a77ecb689f..2b4d0b8a75 100644
--- a/thirdparty/icu4c/common/uvectr32.cpp
+++ b/thirdparty/icu4c/common/uvectr32.cpp
@@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) {
 }
 
 
-bool UVector32::operator==(const UVector32& other) {
+bool UVector32::operator==(const UVector32& other) const {
     int32_t i;
     if (count != other.count) return false;
     for (i=0; i<count; ++i) {
diff --git a/thirdparty/icu4c/common/uvectr32.h b/thirdparty/icu4c/common/uvectr32.h
index f08c2ade76..ecefb7af3e 100644
--- a/thirdparty/icu4c/common/uvectr32.h
+++ b/thirdparty/icu4c/common/uvectr32.h
@@ -86,12 +86,12 @@ public:
      * equal if they are of the same size and all elements are equal,
      * as compared using this object's comparer.
      */
-    bool operator==(const UVector32& other);
+    bool operator==(const UVector32& other) const;
 
     /**
      * Equivalent to !operator==()
      */
-    inline bool operator!=(const UVector32& other);
+    inline bool operator!=(const UVector32& other) const;
 
     //------------------------------------------------------------
     // java.util.Vector API
@@ -268,7 +268,7 @@ inline int32_t UVector32::lastElementi(void) const {
     return elementAti(count-1);
 }
 
-inline bool UVector32::operator!=(const UVector32& other) {
+inline bool UVector32::operator!=(const UVector32& other) const {
     return !operator==(other);
 }
author	Rémi Verschelde <remi@verschelde.fr>	2022-05-17 19:00:41 +0200
committer	GitHub <noreply@github.com>	2022-05-17 19:00:41 +0200
commit	abee814263c586d96b523b483bafb86d7a63aa58 (patch)
tree	2d29c9b6ebaec415bdce2100e2319fcb651d8bbe /thirdparty/icu4c/common
parent	7ea8cde9834b0fda4a928217b7880da3dd330214 (diff)
parent	93fba7ead33b45a6f9904ab6a69ada72e8564230 (diff)