summaryrefslogtreecommitdiff
path: root/thirdparty/icu4c/common
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/icu4c/common')
-rw-r--r--thirdparty/icu4c/common/brkeng.cpp1
-rw-r--r--thirdparty/icu4c/common/brkeng.h2
-rw-r--r--thirdparty/icu4c/common/brkiter.cpp31
-rw-r--r--thirdparty/icu4c/common/dictbe.cpp165
-rw-r--r--thirdparty/icu4c/common/dictbe.h27
-rw-r--r--thirdparty/icu4c/common/localematcher.cpp42
-rw-r--r--thirdparty/icu4c/common/locid.cpp18
-rw-r--r--thirdparty/icu4c/common/lstmbe.cpp3
-rw-r--r--thirdparty/icu4c/common/lstmbe.h1
-rw-r--r--thirdparty/icu4c/common/normalizer2impl.cpp11
-rw-r--r--thirdparty/icu4c/common/rbbi.cpp14
-rw-r--r--thirdparty/icu4c/common/rbbi_cache.cpp2
-rw-r--r--thirdparty/icu4c/common/serv.cpp5
-rw-r--r--thirdparty/icu4c/common/servls.cpp3
-rw-r--r--thirdparty/icu4c/common/servnotf.cpp18
-rw-r--r--thirdparty/icu4c/common/ubrk.cpp12
-rw-r--r--thirdparty/icu4c/common/ucase.cpp40
-rw-r--r--thirdparty/icu4c/common/ucase.h15
-rw-r--r--thirdparty/icu4c/common/ucasemap.cpp115
-rw-r--r--thirdparty/icu4c/common/ucnv.cpp11
-rw-r--r--thirdparty/icu4c/common/ucurr.cpp2
-rw-r--r--thirdparty/icu4c/common/uloc.cpp6
-rw-r--r--thirdparty/icu4c/common/unicode/localematcher.h6
-rw-r--r--thirdparty/icu4c/common/unicode/rbbi.h20
-rw-r--r--thirdparty/icu4c/common/unicode/ubrk.h11
-rw-r--r--thirdparty/icu4c/common/unicode/ucnv.h26
-rw-r--r--thirdparty/icu4c/common/unicode/uniset.h4
-rw-r--r--thirdparty/icu4c/common/unicode/urename.h5
-rw-r--r--thirdparty/icu4c/common/unicode/uset.h18
-rw-r--r--thirdparty/icu4c/common/unicode/uvernum.h10
-rw-r--r--thirdparty/icu4c/common/unistr.cpp8
-rw-r--r--thirdparty/icu4c/common/ustrcase.cpp130
-rw-r--r--thirdparty/icu4c/common/uvector.cpp43
-rw-r--r--thirdparty/icu4c/common/uvector.h12
-rw-r--r--thirdparty/icu4c/common/uvectr32.cpp2
-rw-r--r--thirdparty/icu4c/common/uvectr32.h6
36 files changed, 556 insertions, 289 deletions
diff --git a/thirdparty/icu4c/common/brkeng.cpp b/thirdparty/icu4c/common/brkeng.cpp
index 52e9c53621..dc9fb99bf1 100644
--- a/thirdparty/icu4c/common/brkeng.cpp
+++ b/thirdparty/icu4c/common/brkeng.cpp
@@ -79,6 +79,7 @@ UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t endPos,
UVector32 &/*foundBreaks*/,
+ UBool /* isPhraseBreaking */,
UErrorCode &status) const {
if (U_FAILURE(status)) return 0;
UChar32 c = utext_current32(text);
diff --git a/thirdparty/icu4c/common/brkeng.h b/thirdparty/icu4c/common/brkeng.h
index 6843f1cc95..127ba59e18 100644
--- a/thirdparty/icu4c/common/brkeng.h
+++ b/thirdparty/icu4c/common/brkeng.h
@@ -75,6 +75,7 @@ class LanguageBreakEngine : public UMemory {
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode &status) const = 0;
};
@@ -194,6 +195,7 @@ class UnhandledEngine : public LanguageBreakEngine {
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode &status) const override;
/**
diff --git a/thirdparty/icu4c/common/brkiter.cpp b/thirdparty/icu4c/common/brkiter.cpp
index 8b228acf2c..8a1915880e 100644
--- a/thirdparty/icu4c/common/brkiter.cpp
+++ b/thirdparty/icu4c/common/brkiter.cpp
@@ -30,6 +30,7 @@
#include "unicode/ures.h"
#include "unicode/ustring.h"
#include "unicode/filteredbrk.h"
+#include "bytesinkutil.h"
#include "ucln_cmn.h"
#include "cstring.h"
#include "umutex.h"
@@ -115,7 +116,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
}
// Create a RuleBasedBreakIterator
- result = new RuleBasedBreakIterator(file, status);
+ result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status);
// If there is a result, set the valid locale and actual locale, and the kind
if (U_SUCCESS(status) && result != NULL) {
@@ -408,7 +409,6 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
if (U_FAILURE(status)) {
return NULL;
}
- char lbType[kKeyValueLenMax];
BreakIterator *result = NULL;
switch (kind) {
@@ -428,18 +428,29 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
break;
case UBRK_LINE:
{
+ char lb_lw[kKeyValueLenMax];
UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
- uprv_strcpy(lbType, "line");
- char lbKeyValue[kKeyValueLenMax] = {0};
+ uprv_strcpy(lb_lw, "line");
UErrorCode kvStatus = U_ZERO_ERROR;
- int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
- if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
- uprv_strcat(lbType, "_");
- uprv_strcat(lbType, lbKeyValue);
+ CharString value;
+ CharStringByteSink valueSink(&value);
+ loc.getKeywordValue("lb", valueSink, kvStatus);
+ if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
+ uprv_strcat(lb_lw, "_");
+ uprv_strcat(lb_lw, value.data());
}
- result = BreakIterator::buildInstance(loc, lbType, status);
+ // lw=phrase is only supported in Japanese.
+ if (uprv_strcmp(loc.getLanguage(), "ja") == 0) {
+ value.clear();
+ loc.getKeywordValue("lw", valueSink, kvStatus);
+ if (U_SUCCESS(kvStatus) && value == "phrase") {
+ uprv_strcat(lb_lw, "_");
+ uprv_strcat(lb_lw, value.data());
+ }
+ }
+ result = BreakIterator::buildInstance(loc, lb_lw, status);
- UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue);
+ UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
UTRACE_EXIT_STATUS(status);
}
break;
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp
index 4d158e3226..4fdbdf2760 100644
--- a/thirdparty/icu4c/common/dictbe.cpp
+++ b/thirdparty/icu4c/common/dictbe.cpp
@@ -17,7 +17,10 @@
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
+#include "unicode/resbund.h"
#include "unicode/ubrk.h"
+#include "unicode/usetiter.h"
+#include "ubrkimpl.h"
#include "utracimp.h"
#include "uvectr32.h"
#include "uvector.h"
@@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
(void)startPos; // TODO: remove this param?
@@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
}
rangeStart = start;
rangeEnd = current;
- result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
+ result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
utext_setNativeIndex(text, current);
return result;
@@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
- fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fThaiWordSet);
+ setCharacters(thaiWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fThaiWordSet;
+ fEndWordSet = thaiWordSet;
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
@@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
utext_setNativeIndex(text, rangeStart);
@@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
- fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fLaoWordSet);
+ setCharacters(laoWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fLaoWordSet;
+ fEndWordSet = laoWordSet;
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
@@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
@@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
- fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
+ fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
+ fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.add(0x0020);
if (U_SUCCESS(status)) {
- setCharacters(fBurmeseWordSet);
+ setCharacters(fEndWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
- fMarkSet.add(0x0020);
- fEndWordSet = fBurmeseWordSet;
- fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
// Compact for caching.
fMarkSet.compact();
@@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status ) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
@@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fKhmerWordSet);
+ setCharacters(khmerWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fKhmerWordSet;
+ fEndWordSet = khmerWordSet;
fBeginWordSet.add(0x1780, 0x17B3);
//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
@@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status ) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
@@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
- // Korean dictionary only includes Hangul syllables
- fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
- fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
- fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
- fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
-
- if (U_SUCCESS(status)) {
- // handle Korean and Japanese/Chinese using different dictionaries
- if (type == kKorean) {
+ // Korean dictionary only includes Hangul syllables
+ fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
+ fHangulWordSet.compact();
+ // Digits, open puncutation and Alphabetic characters.
+ fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
+ UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
+ fDigitOrOpenPunctuationOrAlphabetSet.compact();
+ fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
+ fClosePunctuationSet.compact();
+
+ // handle Korean and Japanese/Chinese using different dictionaries
+ if (type == kKorean) {
+ if (U_SUCCESS(status)) {
setCharacters(fHangulWordSet);
- } else { //Chinese and Japanese
- UnicodeSet cjSet;
- cjSet.addAll(fHanWordSet);
- cjSet.addAll(fKatakanaWordSet);
- cjSet.addAll(fHiraganaWordSet);
- cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
- cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+ }
+ } else { //Chinese and Japanese
+ UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+ if (U_SUCCESS(status)) {
setCharacters(cjSet);
+ initJapanesePhraseParameter(status);
}
}
UTRACE_EXIT_STATUS(status);
@@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) {
(value >= 0xFF66 && value <= 0xFF9f);
}
-
// Function for accessing internal utext flags.
// Replicates an internal UText function.
static inline int32_t utext_i32_flag(int32_t bitIndex) {
return (int32_t)1 << bitIndex;
}
-
/*
* @param text A UText representing the text
@@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
if (rangeStart >= rangeEnd) {
@@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
t_boundary.addElement(numCodePts, status);
numBreaks++;
+ } else if (isPhraseBreaking) {
+ t_boundary.addElement(numCodePts, status);
+ if(U_SUCCESS(status)) {
+ numBreaks++;
+ int32_t prevIdx = numCodePts;
+
+ int32_t codeUnitIdx = -1;
+ int32_t prevCodeUnitIdx = -1;
+ int32_t length = -1;
+ for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
+ codeUnitIdx = inString.moveIndex32(0, i);
+ prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
+ // Calculate the length by using the code unit.
+ length = prevCodeUnitIdx - codeUnitIdx;
+ prevIdx = i;
+ // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+ // characters don't occur.
+ if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
+ && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
+ || !isKatakana(inString.char32At(codeUnitIdx)))) {
+ t_boundary.addElement(i, status);
+ numBreaks++;
+ }
+ }
+ }
} else {
for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
t_boundary.addElement(i, status);
@@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
// while reversing t_boundary and pushing values to foundBreaks.
int32_t prevCPPos = -1;
int32_t prevUTextPos = -1;
- for (int32_t i = numBreaks-1; i >= 0; i--) {
+ int32_t correctedNumBreaks = 0;
+ for (int32_t i = numBreaks - 1; i >= 0; i--) {
int32_t cpPos = t_boundary.elementAti(i);
U_ASSERT(cpPos > prevCPPos);
int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
@@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
if (utextPos > prevUTextPos) {
// Boundaries are added to foundBreaks output in ascending order.
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
- foundBreaks.push(utextPos, status);
+ // In phrase breaking, there has to be a breakpoint between Cj character and close
+ // punctuation.
+ // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
+ if (utextPos != rangeStart
+ || (isPhraseBreaking && utextPos > 0
+ && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+ foundBreaks.push(utextPos, status);
+ correctedNumBreaks++;
+ }
} else {
// Normalization expanded the input text, the dictionary found a boundary
// within the expansion, giving two boundaries with the same index in the
@@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
}
(void)prevCPPos; // suppress compiler warnings about unused variable
+ UChar32 nextChar = utext_char32At(inText, rangeEnd);
+ if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+ // In phrase breaking, there has to be a breakpoint between Cj character and
+ // the number/open punctuation.
+ // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+ // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
+ // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
+ if (isPhraseBreaking) {
+ if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+ foundBreaks.popi();
+ correctedNumBreaks--;
+ }
+ } else {
+ foundBreaks.popi();
+ correctedNumBreaks--;
+ }
+ }
+
// inString goes out of scope
// inputMap goes out of scope
- return numBreaks;
+ return correctedNumBreaks;
+}
+
+void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
+ loadJapaneseExtensions(error);
+ loadHiragana(error);
+}
+
+void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
+ const char* tag = "extensions";
+ ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
+ if (U_SUCCESS(error)) {
+ ResourceBundle bundle = ja.get(tag, error);
+ while (U_SUCCESS(error) && bundle.hasNext()) {
+ fSkipSet.puti(bundle.getNextString(error), 1, error);
+ }
+ }
+}
+
+void CjkBreakEngine::loadHiragana(UErrorCode& error) {
+ UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
+ hiraganaWordSet.compact();
+ UnicodeSetIterator iterator(hiraganaWordSet);
+ while (iterator.next()) {
+ fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
+ }
}
#endif
diff --git a/thirdparty/icu4c/common/dictbe.h b/thirdparty/icu4c/common/dictbe.h
index 4e70ed3817..ca1a3c28b7 100644
--- a/thirdparty/icu4c/common/dictbe.h
+++ b/thirdparty/icu4c/common/dictbe.h
@@ -15,6 +15,7 @@
#include "unicode/utext.h"
#include "brkeng.h"
+#include "hash.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
@@ -80,6 +81,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status ) const override;
protected:
@@ -105,6 +107,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const = 0;
};
@@ -127,7 +130,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
* @internal
*/
- UnicodeSet fThaiWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fSuffixSet;
@@ -164,6 +166,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
};
@@ -186,7 +189,6 @@ class LaoBreakEngine : public DictionaryBreakEngine {
* @internal
*/
- UnicodeSet fLaoWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@@ -222,6 +224,7 @@ class LaoBreakEngine : public DictionaryBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
};
@@ -244,7 +247,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
* @internal
*/
- UnicodeSet fBurmeseWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@@ -280,6 +282,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
};
@@ -302,7 +305,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
* @internal
*/
- UnicodeSet fKhmerWordSet;
UnicodeSet fEndWordSet;
UnicodeSet fBeginWordSet;
UnicodeSet fMarkSet;
@@ -338,6 +340,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
};
@@ -366,13 +369,22 @@ class CjkBreakEngine : public DictionaryBreakEngine {
* @internal
*/
UnicodeSet fHangulWordSet;
- UnicodeSet fHanWordSet;
- UnicodeSet fKatakanaWordSet;
- UnicodeSet fHiraganaWordSet;
+ UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
+ UnicodeSet fClosePunctuationSet;
DictionaryMatcher *fDictionary;
const Normalizer2 *nfkcNorm2;
+ private:
+ // Load Japanese extensions.
+ void loadJapaneseExtensions(UErrorCode& error);
+ // Load Japanese Hiragana.
+ void loadHiragana(UErrorCode& error);
+ // Initialize fSkipSet by loading Japanese Hiragana and extensions.
+ void initJapanesePhraseParameter(UErrorCode& error);
+
+ Hashtable fSkipSet;
+
public:
/**
@@ -404,6 +416,7 @@ class CjkBreakEngine : public DictionaryBreakEngine {
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
};
diff --git a/thirdparty/icu4c/common/localematcher.cpp b/thirdparty/icu4c/common/localematcher.cpp
index 3d178dfbaf..2cad708d99 100644
--- a/thirdparty/icu4c/common/localematcher.cpp
+++ b/thirdparty/icu4c/common/localematcher.cpp
@@ -168,12 +168,9 @@ void LocaleMatcher::Builder::clearSupportedLocales() {
bool LocaleMatcher::Builder::ensureSupportedLocaleVector() {
if (U_FAILURE(errorCode_)) { return false; }
if (supportedLocales_ != nullptr) { return true; }
- supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_);
+ LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_);
if (U_FAILURE(errorCode_)) { return false; }
- if (supportedLocales_ == nullptr) {
- errorCode_ = U_MEMORY_ALLOCATION_ERROR;
- return false;
- }
+ supportedLocales_ = lpSupportedLocales.orphan();
return true;
}
@@ -187,9 +184,8 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
for (int32_t i = 0; i < length; ++i) {
Locale *locale = list.orphanLocaleAt(i);
if (locale == nullptr) { continue; }
- supportedLocales_->addElementX(locale, errorCode_);
+ supportedLocales_->adoptElement(locale, errorCode_);
if (U_FAILURE(errorCode_)) {
- delete locale;
break;
}
}
@@ -197,35 +193,21 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
}
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) {
- if (U_FAILURE(errorCode_)) { return *this; }
- clearSupportedLocales();
- if (!ensureSupportedLocaleVector()) { return *this; }
- while (locales.hasNext()) {
- const Locale &locale = locales.next();
- Locale *clone = locale.clone();
- if (clone == nullptr) {
- errorCode_ = U_MEMORY_ALLOCATION_ERROR;
- break;
- }
- supportedLocales_->addElementX(clone, errorCode_);
- if (U_FAILURE(errorCode_)) {
- delete clone;
- break;
+ if (ensureSupportedLocaleVector()) {
+ clearSupportedLocales();
+ while (locales.hasNext() && U_SUCCESS(errorCode_)) {
+ const Locale &locale = locales.next();
+ LocalPointer<Locale> clone (locale.clone(), errorCode_);
+ supportedLocales_->adoptElement(clone.orphan(), errorCode_);
}
}
return *this;
}
LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) {
- if (!ensureSupportedLocaleVector()) { return *this; }
- Locale *clone = locale.clone();
- if (clone == nullptr) {
- errorCode_ = U_MEMORY_ALLOCATION_ERROR;
- return *this;
- }
- supportedLocales_->addElementX(clone, errorCode_);
- if (U_FAILURE(errorCode_)) {
- delete clone;
+ if (ensureSupportedLocaleVector()) {
+ LocalPointer<Locale> clone(locale.clone(), errorCode_);
+ supportedLocales_->adoptElement(clone.orphan(), errorCode_);
}
return *this;
}
diff --git a/thirdparty/icu4c/common/locid.cpp b/thirdparty/icu4c/common/locid.cpp
index e8859c7048..73bb8d8aec 100644
--- a/thirdparty/icu4c/common/locid.cpp
+++ b/thirdparty/icu4c/common/locid.cpp
@@ -1204,14 +1204,11 @@ AliasReplacer::parseLanguageReplacement(
// We have multiple field so we have to allocate and parse
CharString* str = new CharString(
replacement, (int32_t)uprv_strlen(replacement), status);
+ LocalPointer<CharString> lpStr(str, status);
+ toBeFreed.adoptElement(lpStr.orphan(), status);
if (U_FAILURE(status)) {
return;
}
- if (str == nullptr) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- toBeFreed.addElementX(str, status);
char* data = str->data();
replacedLanguage = (const char*) data;
char* endOfField = uprv_strchr(data, '_');
@@ -1420,12 +1417,9 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status)
(int32_t)(firstSpace - replacement), status), status);
}
if (U_FAILURE(status)) { return false; }
- if (item.isNull()) {
- status = U_MEMORY_ALLOCATION_ERROR;
- return false;
- }
replacedRegion = item->data();
- toBeFreed.addElementX(item.orphan(), status);
+ toBeFreed.adoptElement(item.orphan(), status);
+ if (U_FAILURE(status)) { return false; }
}
U_ASSERT(!same(region, replacedRegion));
region = replacedRegion;
@@ -1659,10 +1653,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
U_SUCCESS(status)) {
*end = NULL_CHAR; // null terminate inside variantsBuff
- variants.addElementX(start, status);
+ variants.addElement(start, status);
start = end + 1;
}
- variants.addElementX(start, status);
+ variants.addElement(start, status);
}
if (U_FAILURE(status)) { return false; }
diff --git a/thirdparty/icu4c/common/lstmbe.cpp b/thirdparty/icu4c/common/lstmbe.cpp
index 3793abceb3..f6114cdfe2 100644
--- a/thirdparty/icu4c/common/lstmbe.cpp
+++ b/thirdparty/icu4c/common/lstmbe.cpp
@@ -1,8 +1,8 @@
// © 2021 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
+#include <complex>
#include <utility>
-#include <ctgmath>
#include "unicode/utypes.h"
@@ -639,6 +639,7 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
int32_t beginFoundBreakSize = foundBreaks.size();
diff --git a/thirdparty/icu4c/common/lstmbe.h b/thirdparty/icu4c/common/lstmbe.h
index c3f7ecf815..ffdf805eca 100644
--- a/thirdparty/icu4c/common/lstmbe.h
+++ b/thirdparty/icu4c/common/lstmbe.h
@@ -62,6 +62,7 @@ protected:
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const override;
private:
const LSTMData* fData;
diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp
index 5bfd49e8cb..e6bd75e717 100644
--- a/thirdparty/icu4c/common/normalizer2impl.cpp
+++ b/thirdparty/icu4c/common/normalizer2impl.cpp
@@ -2496,15 +2496,18 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
// origin is not the first character, or it is U+0000.
UnicodeSet *set;
if((canonValue&CANON_HAS_SET)==0) {
- set=new UnicodeSet;
- if(set==NULL) {
- errorCode=U_MEMORY_ALLOCATION_ERROR;
+ LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
+ set=lpSet.getAlias();
+ if(U_FAILURE(errorCode)) {
return;
}
UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
- canonStartSets.addElementX(set, errorCode);
+ canonStartSets.adoptElement(lpSet.orphan(), errorCode);
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
if(firstOrigin!=0) {
set->add(firstOrigin);
}
diff --git a/thirdparty/icu4c/common/rbbi.cpp b/thirdparty/icu4c/common/rbbi.cpp
index f65177f232..cae8d154b3 100644
--- a/thirdparty/icu4c/common/rbbi.cpp
+++ b/thirdparty/icu4c/common/rbbi.cpp
@@ -82,6 +82,19 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
}
}
+//-------------------------------------------------------------------------------
+//
+// Constructor from a UDataMemory handle to precompiled break rules
+// stored in an ICU data file. This construcotr is private API,
+// only for internal use.
+//
+//-------------------------------------------------------------------------------
+RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
+ UErrorCode &status) : RuleBasedBreakIterator(udm, status)
+{
+ fIsPhraseBreaking = isPhraseBreaking;
+}
+
//
// Construct from precompiled binary rules (tables). This constructor is public API,
// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
@@ -322,6 +335,7 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
fBreakCache = nullptr;
fDictionaryCache = nullptr;
fLookAheadMatches = nullptr;
+ fIsPhraseBreaking = false;
// Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
// fText = UTEXT_INITIALIZER;
diff --git a/thirdparty/icu4c/common/rbbi_cache.cpp b/thirdparty/icu4c/common/rbbi_cache.cpp
index 6bfe3feca4..26d82df781 100644
--- a/thirdparty/icu4c/common/rbbi_cache.cpp
+++ b/thirdparty/icu4c/common/rbbi_cache.cpp
@@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != NULL) {
- foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status);
+ foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
}
// Reload the loop variables for the next go-round
diff --git a/thirdparty/icu4c/common/serv.cpp b/thirdparty/icu4c/common/serv.cpp
index 0c54a4dce9..c26dbca1a9 100644
--- a/thirdparty/icu4c/common/serv.cpp
+++ b/thirdparty/icu4c/common/serv.cpp
@@ -625,10 +625,7 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC
}
}
- LocalPointer<UnicodeString> idClone(new UnicodeString(*id), status);
- if (U_SUCCESS(status) && idClone->isBogus()) {
- status = U_MEMORY_ALLOCATION_ERROR;
- }
+ LocalPointer<UnicodeString> idClone(id->clone(), status);
result.adoptElement(idClone.orphan(), status);
}
delete fallbackKey;
diff --git a/thirdparty/icu4c/common/servls.cpp b/thirdparty/icu4c/common/servls.cpp
index 7108afd4a5..98f0a8a12b 100644
--- a/thirdparty/icu4c/common/servls.cpp
+++ b/thirdparty/icu4c/common/servls.cpp
@@ -179,7 +179,8 @@ private:
length = other._ids.size();
for(i = 0; i < length; ++i) {
- _ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
+ LocalPointer<UnicodeString> clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
+ _ids.adoptElement(clonedId.orphan(), status);
}
if(U_SUCCESS(status)) {
diff --git a/thirdparty/icu4c/common/servnotf.cpp b/thirdparty/icu4c/common/servnotf.cpp
index 342e0d9f24..d9fb388752 100644
--- a/thirdparty/icu4c/common/servnotf.cpp
+++ b/thirdparty/icu4c/common/servnotf.cpp
@@ -49,7 +49,11 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
if (acceptsListener(*l)) {
Mutex lmx(&notifyLock);
if (listeners == NULL) {
- listeners = new UVector(5, status);
+ LocalPointer<UVector> lpListeners(new UVector(5, status), status);
+ if (U_FAILURE(status)) {
+ return;
+ }
+ listeners = lpListeners.orphan();
} else {
for (int i = 0, e = listeners->size(); i < e; ++i) {
const EventListener* el = (const EventListener*)(listeners->elementAt(i));
@@ -59,7 +63,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
}
}
- listeners->addElementX((void*)l, status); // cast away const
+ listeners->addElement((void*)l, status); // cast away const
}
#ifdef NOTIFIER_DEBUG
else {
@@ -102,13 +106,11 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status)
void
ICUNotifier::notifyChanged(void)
{
+ Mutex lmx(&notifyLock);
if (listeners != NULL) {
- Mutex lmx(&notifyLock);
- if (listeners != NULL) {
- for (int i = 0, e = listeners->size(); i < e; ++i) {
- EventListener* el = (EventListener*)listeners->elementAt(i);
- notifyListener(*el);
- }
+ for (int i = 0, e = listeners->size(); i < e; ++i) {
+ EventListener* el = (EventListener*)listeners->elementAt(i);
+ notifyListener(*el);
}
}
}
diff --git a/thirdparty/icu4c/common/ubrk.cpp b/thirdparty/icu4c/common/ubrk.cpp
index bb5bdd1b50..f4e064961f 100644
--- a/thirdparty/icu4c/common/ubrk.cpp
+++ b/thirdparty/icu4c/common/ubrk.cpp
@@ -168,7 +168,7 @@ ubrk_safeClone(
BreakIterator *newBI = ((BreakIterator *)bi)->clone();
if (newBI == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
- } else {
+ } else if (pBufferSize != NULL) {
*status = U_SAFECLONE_ALLOCATED_WARNING;
}
return (UBreakIterator *)newBI;
@@ -176,15 +176,7 @@ ubrk_safeClone(
U_CAPI UBreakIterator * U_EXPORT2
ubrk_clone(const UBreakIterator *bi, UErrorCode *status) {
- if (U_FAILURE(*status)) {
- return nullptr;
- }
- BreakIterator *newBI = ((BreakIterator *)bi)->clone();
- if (newBI == nullptr) {
- *status = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- return (UBreakIterator *)newBI;
+ return ubrk_safeClone(bi, nullptr, nullptr, status);
}
diff --git a/thirdparty/icu4c/common/ucase.cpp b/thirdparty/icu4c/common/ucase.cpp
index 4aa856507a..388c86b1bb 100644
--- a/thirdparty/icu4c/common/ucase.cpp
+++ b/thirdparty/icu4c/common/ucase.cpp
@@ -22,27 +22,14 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uset.h"
-#include "unicode/udata.h" /* UDataInfo */
#include "unicode/utf16.h"
-#include "ucmndata.h" /* DataHeader */
-#include "udatamem.h"
-#include "umutex.h"
-#include "uassert.h"
#include "cmemory.h"
-#include "utrie2.h"
+#include "uassert.h"
#include "ucase.h"
+#include "umutex.h"
+#include "utrie2.h"
-struct UCaseProps {
- UDataMemory *mem;
- const int32_t *indexes;
- const uint16_t *exceptions;
- const uint16_t *unfold;
-
- UTrie2 trie;
- uint8_t formatVersion[4];
-};
-
-/* ucase_props_data.h is machine-generated by gencase --csource */
+/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
#define INCLUDED_FROM_UCASE_CPP
#include "ucase_props_data.h"
@@ -77,6 +64,13 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* data access primitives --------------------------------------------------- */
+U_CAPI const struct UCaseProps * U_EXPORT2
+ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
+ *pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
+ *pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
+ return &ucase_props_singleton;
+}
+
U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie() {
return &ucase_props_singleton.trie;
@@ -690,7 +684,7 @@ ucase_isCaseSensitive(UChar32 c) {
* - The general category of C is
* Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
* Letter Modifier (Lm), or Symbol Modifier (Sk)
- * - C is one of the following characters
+ * - C is one of the following characters
* U+0027 APOSTROPHE
* U+00AD SOFT HYPHEN (SHY)
* U+2019 RIGHT SINGLE QUOTATION MARK
@@ -1064,6 +1058,8 @@ ucase_toFullLower(UChar32 c,
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
U_ASSERT(c >= 0);
UChar32 result=c;
+ // Reset the output pointer in case it was uninitialized.
+ *pString=nullptr;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
@@ -1148,7 +1144,6 @@ ucase_toFullLower(UChar32 c,
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
*/
- *pString=nullptr;
return 0; /* remove the dot (continue without output) */
} else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
/*
@@ -1215,6 +1210,8 @@ toUpperOrTitle(UChar32 c,
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
U_ASSERT(c >= 0);
UChar32 result=c;
+ // Reset the output pointer in case it was uninitialized.
+ *pString=nullptr;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
@@ -1252,7 +1249,6 @@ toUpperOrTitle(UChar32 c,
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
*/
- *pString=nullptr;
return 0; /* remove the dot (continue without output) */
} else if(c==0x0587) {
// See ICU-13416:
@@ -1449,6 +1445,8 @@ ucase_toFullFolding(UChar32 c,
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
U_ASSERT(c >= 0);
UChar32 result=c;
+ // Reset the output pointer in case it was uninitialized.
+ *pString=nullptr;
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
if(!UCASE_HAS_EXCEPTION(props)) {
if(UCASE_IS_UPPER_OR_TITLE(props)) {
@@ -1542,7 +1540,7 @@ U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c) {
return ucase_tolower(c);
}
-
+
/* Transforms the Unicode character to its upper case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c) {
diff --git a/thirdparty/icu4c/common/ucase.h b/thirdparty/icu4c/common/ucase.h
index a018f82b81..7bf57fd370 100644
--- a/thirdparty/icu4c/common/ucase.h
+++ b/thirdparty/icu4c/common/ucase.h
@@ -312,6 +312,21 @@ UCaseMapFull(UChar32 c,
U_CDECL_END
+/* for icuexportdata -------------------------------------------------------- */
+
+struct UCaseProps {
+ void *mem; // TODO: was unused, and type UDataMemory -- remove
+ const int32_t *indexes;
+ const uint16_t *exceptions;
+ const uint16_t *unfold;
+
+ UTrie2 trie;
+ uint8_t formatVersion[4];
+};
+
+U_CAPI const struct UCaseProps * U_EXPORT2
+ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength);
+
/* file definitions --------------------------------------------------------- */
#define UCASE_DATA_NAME "ucase"
diff --git a/thirdparty/icu4c/common/ucasemap.cpp b/thirdparty/icu4c/common/ucasemap.cpp
index ed72bda828..95b55d56a0 100644
--- a/thirdparty/icu4c/common/ucasemap.cpp
+++ b/thirdparty/icu4c/common/ucasemap.cpp
@@ -112,8 +112,7 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
if(length==sizeof(csm->locale)) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
- if(U_SUCCESS(*pErrorCode)) {
- csm->caseLocale=UCASE_LOC_UNKNOWN;
+ if(U_SUCCESS(*pErrorCode)) {
csm->caseLocale = ucase_getCaseLocale(csm->locale);
} else {
csm->locale[0]=0;
@@ -420,6 +419,97 @@ void toUpper(int32_t caseLocale, uint32_t options,
#if !UCONFIG_NO_BREAK_ITERATION
+namespace {
+
+constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
+
+constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
+ ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
+ U_ASSERT(start < segmentLimit);
+
+ int32_t index = start;
+ bool withAcute = false;
+
+ // If the conditions are met, then the following variables tell us what to output.
+ int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
+ bool doTitleJ = false; // true if the j needs to be titlecased
+ int32_t unchanged2 = 0; // after the j (0 or 1)
+
+ // next character after the first letter
+ UChar32 c2;
+ c2 = src[index++];
+
+ // Is the first letter an i/I with accent?
+ if (c == u'I') {
+ if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
+ withAcute = true;
+ unchanged1 = 2; // ACUTE is 2 code units in UTF-8
+ if (index == segmentLimit) { return start; }
+ c2 = src[index++];
+ }
+ } else { // Í
+ withAcute = true;
+ }
+
+ // Is the next character a j/J?
+ if (c2 == u'j') {
+ doTitleJ = true;
+ } else if (c2 == u'J') {
+ ++unchanged1;
+ } else {
+ return start;
+ }
+
+ // A plain i/I must be followed by a plain j/J.
+ // An i/I with acute must be followed by a j/J with acute.
+ if (withAcute) {
+ if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
+ return start;
+ }
+ if (doTitleJ) {
+ unchanged2 = 2; // ACUTE is 2 code units in UTF-8
+ } else {
+ unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8
+ }
+ }
+
+ // There must not be another combining mark.
+ if (index < segmentLimit) {
+ int32_t cp;
+ int32_t i = index;
+ U8_NEXT(src, i, segmentLimit, cp);
+ uint32_t typeMask = U_GET_GC_MASK(cp);
+ if ((typeMask & U_GC_M_MASK) != 0) {
+ return start;
+ }
+ }
+
+ // Output the rest of the Dutch IJ.
+ ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
+ start += unchanged1;
+ if (doTitleJ) {
+ ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
+ ++start;
+ }
+ ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
+
+ U_ASSERT(start + unchanged2 == index);
+ return index;
+}
+
+} // namespace
+
U_CFUNC void U_CALLCONV
ucasemap_internalUTF8ToTitle(
int32_t caseLocale, uint32_t options, BreakIterator *iter,
@@ -504,19 +594,14 @@ ucasemap_internalUTF8ToTitle(
}
/* Special case Dutch IJ titlecasing */
- if (titleStart+1 < index &&
- caseLocale == UCASE_LOC_DUTCH &&
- (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
- if (src[titleStart+1] == 0x006A) {
- ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
- titleLimit++;
- } else if (src[titleStart+1] == 0x004A) {
- // Keep the capital J from getting lowercased.
- if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
- sink, options, edits, errorCode)) {
- return;
- }
- titleLimit++;
+ if (titleLimit < index &&
+ caseLocale == UCASE_LOC_DUTCH) {
+ if (c < 0) {
+ c = ~c;
+ }
+
+ if (c == u'I' || c == u'Í') {
+ titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
}
}
diff --git a/thirdparty/icu4c/common/ucnv.cpp b/thirdparty/icu4c/common/ucnv.cpp
index 5dcf35e043..019bcb6a79 100644
--- a/thirdparty/icu4c/common/ucnv.cpp
+++ b/thirdparty/icu4c/common/ucnv.cpp
@@ -252,7 +252,10 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
UTRACE_EXIT_STATUS(*status);
return NULL;
}
- *status = U_SAFECLONE_ALLOCATED_WARNING;
+ // If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function.
+ if (pBufferSize != &stackBufferSize) {
+ *status = U_SAFECLONE_ALLOCATED_WARNING;
+ }
/* record the fact that memory was allocated */
*pBufferSize = bufferSizeNeeded;
@@ -317,7 +320,11 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
return localConverter;
}
-
+U_CAPI UConverter* U_EXPORT2
+ucnv_clone(const UConverter* cnv, UErrorCode *status)
+{
+ return ucnv_safeClone(cnv, nullptr, nullptr, status);
+}
/*Decreases the reference counter in the shared immutable section of the object
*and frees the mutable part*/
diff --git a/thirdparty/icu4c/common/ucurr.cpp b/thirdparty/icu4c/common/ucurr.cpp
index 67aab4e8ff..6e489e0563 100644
--- a/thirdparty/icu4c/common/ucurr.cpp
+++ b/thirdparty/icu4c/common/ucurr.cpp
@@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void)
}
/**
- * Deleter for OlsonToMetaMappingEntry
+ * Deleter for IsoCodeEntry
*/
static void U_CALLCONV
deleteIsoCodeEntry(void *obj) {
diff --git a/thirdparty/icu4c/common/uloc.cpp b/thirdparty/icu4c/common/uloc.cpp
index c8a3f1ff73..99c6a0af39 100644
--- a/thirdparty/icu4c/common/uloc.cpp
+++ b/thirdparty/icu4c/common/uloc.cpp
@@ -186,10 +186,10 @@ NULL
};
static const char* const DEPRECATED_LANGUAGES[]={
- "in", "iw", "ji", "jw", NULL, NULL
+ "in", "iw", "ji", "jw", "mo", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
- "id", "he", "yi", "jv", NULL, NULL
+ "id", "he", "yi", "jv", "ro", NULL, NULL
};
/**
@@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = {
/* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
"VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */
- "WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
+ "WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
"ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
diff --git a/thirdparty/icu4c/common/unicode/localematcher.h b/thirdparty/icu4c/common/unicode/localematcher.h
index 252bb7fdc2..0f7e04a3af 100644
--- a/thirdparty/icu4c/common/unicode/localematcher.h
+++ b/thirdparty/icu4c/common/unicode/localematcher.h
@@ -461,13 +461,13 @@ public:
* Option for whether to include or ignore one-way (fallback) match data.
* By default, they are included.
*
- * @param direction the match direction to set.
+ * @param matchDirection the match direction to set.
* @return this Builder object
* @stable ICU 67
*/
- Builder &setDirection(ULocMatchDirection direction) {
+ Builder &setDirection(ULocMatchDirection matchDirection) {
if (U_SUCCESS(errorCode_)) {
- direction_ = direction;
+ direction_ = matchDirection;
}
return *this;
}
diff --git a/thirdparty/icu4c/common/unicode/rbbi.h b/thirdparty/icu4c/common/unicode/rbbi.h
index 0ce93819f5..0bad0d3897 100644
--- a/thirdparty/icu4c/common/unicode/rbbi.h
+++ b/thirdparty/icu4c/common/unicode/rbbi.h
@@ -147,6 +147,11 @@ private:
*/
int32_t *fLookAheadMatches;
+ /**
+ * A flag to indicate if phrase based breaking is enabled.
+ */
+ UBool fIsPhraseBreaking;
+
//=======================================================================
// constructors
//=======================================================================
@@ -163,6 +168,21 @@ private:
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
+ /**
+ * This constructor uses the udata interface to create a BreakIterator
+ * whose internal tables live in a memory-mapped file. "image" is an
+ * ICU UDataMemory handle for the pre-compiled break iterator tables.
+ * @param image handle to the memory image for the break iterator data.
+ * Ownership of the UDataMemory handle passes to the Break Iterator,
+ * which will be responsible for closing it when it is no longer needed.
+ * @param status Information on any errors encountered.
+ * @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
+ * @see udata_open
+ * @see #getBinaryRules
+ * @internal (private)
+ */
+ RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
+
/** @internal */
friend class RBBIRuleBuilder;
/** @internal */
diff --git a/thirdparty/icu4c/common/unicode/ubrk.h b/thirdparty/icu4c/common/unicode/ubrk.h
index c603f7c13f..2b3dc7aa57 100644
--- a/thirdparty/icu4c/common/unicode/ubrk.h
+++ b/thirdparty/icu4c/common/unicode/ubrk.h
@@ -312,11 +312,12 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
* If *pBufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
- * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
+ * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
+ * if pBufferSize != NULL and any allocations were necessary
* @return pointer to the new clone
* @deprecated ICU 69 Use ubrk_clone() instead.
*/
-U_CAPI UBreakIterator * U_EXPORT2
+U_DEPRECATED UBreakIterator * U_EXPORT2
ubrk_safeClone(
const UBreakIterator *bi,
void *stackBuffer,
@@ -325,21 +326,17 @@ ubrk_safeClone(
#endif /* U_HIDE_DEPRECATED_API */
-#ifndef U_HIDE_DRAFT_API
-
/**
* Thread safe cloning operation.
* @param bi iterator to be cloned
* @param status to indicate whether the operation went on smoothly or there were errors
* @return pointer to the new clone
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI UBreakIterator * U_EXPORT2
ubrk_clone(const UBreakIterator *bi,
UErrorCode *status);
-#endif // U_HIDE_DRAFT_API
-
#ifndef U_HIDE_DEPRECATED_API
/**
diff --git a/thirdparty/icu4c/common/unicode/ucnv.h b/thirdparty/icu4c/common/unicode/ucnv.h
index 2687c984d4..20c173b662 100644
--- a/thirdparty/icu4c/common/unicode/ucnv.h
+++ b/thirdparty/icu4c/common/unicode/ucnv.h
@@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage,
*
* <p>The name will NOT be looked up in the alias mechanism, nor will the converter be
* stored in the converter cache or the alias table. The only way to open further converters
- * is call this function multiple times, or use the ucnv_safeClone() function to clone a
+ * is call this function multiple times, or use the ucnv_clone() function to clone a
* 'primary' converter.</p>
*
* <p>A future version of ICU may add alias table lookups and/or caching
@@ -493,7 +493,7 @@ ucnv_openCCSID(int32_t codepage,
* @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred
* @see udata_open
* @see ucnv_open
- * @see ucnv_safeClone
+ * @see ucnv_clone
* @see ucnv_close
* @stable ICU 2.2
*/
@@ -502,6 +502,20 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode
/**
* Thread safe converter cloning operation.
+ *
+ * You must ucnv_close() the clone.
+ *
+ * @param cnv converter to be cloned
+ * @param status to indicate whether the operation went on smoothly or there were errors
+ * @return pointer to the new clone
+ * @stable ICU 71
+ */
+U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status);
+
+#ifndef U_HIDE_DEPRECATED_API
+
+/**
+ * Thread safe converter cloning operation.
* For most efficient operation, pass in a stackBuffer (and a *pBufferSize)
* with at least U_CNV_SAFECLONE_BUFFERSIZE bytes of space.
* If the buffer size is sufficient, then the clone will use the stack buffer;
@@ -532,21 +546,19 @@ ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode
* pointer to size of allocated space.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_WARNING,
- * is used if any allocations were necessary.
+ * is used if pBufferSize != NULL and any allocations were necessary
* However, it is better to check if *pBufferSize grew for checking for
* allocations because warning codes can be overridden by subsequent
* function calls.
* @return pointer to the new clone
- * @stable ICU 2.0
+ * @deprecated ICU 71 Use ucnv_clone() instead.
*/
-U_CAPI UConverter * U_EXPORT2
+U_DEPRECATED UConverter * U_EXPORT2
ucnv_safeClone(const UConverter *cnv,
void *stackBuffer,
int32_t *pBufferSize,
UErrorCode *status);
-#ifndef U_HIDE_DEPRECATED_API
-
/**
* \def U_CNV_SAFECLONE_BUFFERSIZE
* Definition of a buffer size that is designed to be large enough for
diff --git a/thirdparty/icu4c/common/unicode/uniset.h b/thirdparty/icu4c/common/unicode/uniset.h
index 730337a353..310c7c8d20 100644
--- a/thirdparty/icu4c/common/unicode/uniset.h
+++ b/thirdparty/icu4c/common/unicode/uniset.h
@@ -1229,7 +1229,6 @@ public:
*/
UnicodeSet& retain(UChar32 c);
-#ifndef U_HIDE_DRAFT_API
/**
* Retains only the specified string from this set if it is present.
* Upon return this set will be empty if it did not contain s, or
@@ -1238,10 +1237,9 @@ public:
*
* @param s the source string
* @return this object, for chaining
- * @draft ICU 69
+ * @stable ICU 69
*/
UnicodeSet& retain(const UnicodeString &s);
-#endif // U_HIDE_DRAFT_API
/**
* Removes the specified range from this set if it is present.
diff --git a/thirdparty/icu4c/common/unicode/urename.h b/thirdparty/icu4c/common/unicode/urename.h
index 4605f632ea..d9f9b8f336 100644
--- a/thirdparty/icu4c/common/unicode/urename.h
+++ b/thirdparty/icu4c/common/unicode/urename.h
@@ -567,6 +567,7 @@
#define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure)
#define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold)
#define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale)
+#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton)
#define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie)
#define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType)
#define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable)
@@ -630,6 +631,7 @@
#define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars)
#define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub)
#define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars)
+#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone)
#define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close)
#define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames)
#define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert)
@@ -725,6 +727,7 @@
#define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString)
#define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8)
#define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize)
+#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone)
#define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary)
#define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close)
#define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements)
@@ -904,6 +907,7 @@
#define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern)
#define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions)
#define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat)
+#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle)
#define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal)
#define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle)
#define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName)
@@ -918,6 +922,7 @@
#define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat)
#define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName)
#define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat)
+#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle)
#define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal)
#define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap)
#define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close)
diff --git a/thirdparty/icu4c/common/unicode/uset.h b/thirdparty/icu4c/common/unicode/uset.h
index 2ef352ef56..33332f2d36 100644
--- a/thirdparty/icu4c/common/unicode/uset.h
+++ b/thirdparty/icu4c/common/unicode/uset.h
@@ -628,7 +628,6 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
U_CAPI void U_EXPORT2
uset_removeString(USet* set, const UChar* str, int32_t strLen);
-#ifndef U_HIDE_DRAFT_API
/**
* Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
* A frozen set will not be modified.
@@ -636,11 +635,10 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen);
* @param set the object to be modified
* @param str the string
* @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif // U_HIDE_DRAFT_API
/**
* Removes from this set all of its elements that are contained in the
@@ -671,7 +669,6 @@ uset_removeAll(USet* set, const USet* removeSet);
U_CAPI void U_EXPORT2
uset_retain(USet* set, UChar32 start, UChar32 end);
-#ifndef U_HIDE_DRAFT_API
/**
* Retains only the specified string from this set if it is present.
* Upon return this set will be empty if it did not contain s, or
@@ -681,7 +678,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end);
* @param set the object to be modified
* @param str the string
* @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_retainString(USet *set, const UChar *str, int32_t length);
@@ -693,11 +690,10 @@ uset_retainString(USet *set, const UChar *str, int32_t length);
* @param set the object to be modified
* @param str the string
* @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif // U_HIDE_DRAFT_API
/**
* Retains only the elements in this set that are contained in the
@@ -741,7 +737,6 @@ uset_compact(USet* set);
U_CAPI void U_EXPORT2
uset_complement(USet* set);
-#ifndef U_HIDE_DRAFT_API
/**
* Complements the specified range in this set. Any character in
* the range will be removed if it is in this set, or will be
@@ -753,7 +748,7 @@ uset_complement(USet* set);
* @param set the object to be modified
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_complementRange(USet *set, UChar32 start, UChar32 end);
@@ -766,7 +761,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end);
* @param set the object to be modified
* @param str the string
* @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_complementString(USet *set, const UChar *str, int32_t length);
@@ -778,11 +773,10 @@ uset_complementString(USet *set, const UChar *str, int32_t length);
* @param set the object to be modified
* @param str the string
* @param length the length of the string, or -1 if NUL-terminated
- * @draft ICU 69
+ * @stable ICU 69
*/
U_CAPI void U_EXPORT2
uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
-#endif // U_HIDE_DRAFT_API
/**
* Complements in this set all elements contained in the specified
diff --git a/thirdparty/icu4c/common/unicode/uvernum.h b/thirdparty/icu4c/common/unicode/uvernum.h
index 42e8865d7e..2706e0b060 100644
--- a/thirdparty/icu4c/common/unicode/uvernum.h
+++ b/thirdparty/icu4c/common/unicode/uvernum.h
@@ -60,7 +60,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
-#define U_ICU_VERSION_MAJOR_NUM 70
+#define U_ICU_VERSION_MAJOR_NUM 71
/** The current ICU minor version as an integer.
* This value will change in the subsequent releases of ICU
@@ -86,7 +86,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.6
*/
-#define U_ICU_VERSION_SUFFIX _70
+#define U_ICU_VERSION_SUFFIX _71
/**
* \def U_DEF2_ICU_ENTRY_POINT_RENAME
@@ -139,7 +139,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
-#define U_ICU_VERSION "70.1"
+#define U_ICU_VERSION "71.1"
/**
* The current ICU library major version number as a string, for library name suffixes.
@@ -152,13 +152,13 @@
*
* @stable ICU 2.6
*/
-#define U_ICU_VERSION_SHORT "70"
+#define U_ICU_VERSION_SHORT "71"
#ifndef U_HIDE_INTERNAL_API
/** Data version in ICU4C.
* @internal ICU 4.4 Internal Use Only
**/
-#define U_ICU_DATA_VERSION "70.1"
+#define U_ICU_DATA_VERSION "71.1"
#endif /* U_HIDE_INTERNAL_API */
/*===========================================================================
diff --git a/thirdparty/icu4c/common/unistr.cpp b/thirdparty/icu4c/common/unistr.cpp
index 077b4d6ef2..c18665928d 100644
--- a/thirdparty/icu4c/common/unistr.cpp
+++ b/thirdparty/icu4c/common/unistr.cpp
@@ -334,7 +334,8 @@ Replaceable::clone() const {
// UnicodeString overrides clone() with a real implementation
UnicodeString *
UnicodeString::clone() const {
- return new UnicodeString(*this);
+ LocalPointer<UnicodeString> clonedString(new UnicodeString(*this));
+ return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr;
}
//========================================
@@ -1976,7 +1977,12 @@ The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This makes sure that static library dependencies are kept to a minimum.
*/
+#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
static void uprv_UnicodeStringDummy(void) {
delete [] (new UnicodeString[2]);
}
+#pragma GCC diagnostic pop
+#endif
#endif
diff --git a/thirdparty/icu4c/common/ustrcase.cpp b/thirdparty/icu4c/common/ustrcase.cpp
index 36b19e75f2..43910ea520 100644
--- a/thirdparty/icu4c/common/ustrcase.cpp
+++ b/thirdparty/icu4c/common/ustrcase.cpp
@@ -36,6 +36,12 @@
#include "ustr_imp.h"
#include "uassert.h"
+/**
+ * Code point for COMBINING ACUTE ACCENT
+ * @internal
+ */
+#define ACUTE u'\u0301'
+
U_NAMESPACE_BEGIN
namespace {
@@ -396,6 +402,94 @@ U_NAMESPACE_USE
#if !UCONFIG_NO_BREAK_ITERATION
+namespace {
+
+/**
+ * Input: c is a letter I with or without acute accent.
+ * start is the index in src after c, and is less than segmentLimit.
+ * If a plain i/I is followed by a plain j/J,
+ * or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
+ * then we output accordingly.
+ *
+ * @return the src index after the titlecased sequence, or the start index if no Dutch IJ
+ */
+int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
+ UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
+ icu::Edits *edits) {
+ U_ASSERT(start < segmentLimit);
+
+ int32_t index = start;
+ bool withAcute = false;
+
+ // If the conditions are met, then the following variables tell us what to output.
+ int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
+ bool doTitleJ = false; // true if the j needs to be titlecased
+ int32_t unchanged2 = 0; // after the j (0 or 1)
+
+ // next character after the first letter
+ UChar c2 = src[index++];
+
+ // Is the first letter an i/I with accent?
+ if (c == u'I') {
+ if (c2 == ACUTE) {
+ withAcute = true;
+ unchanged1 = 1;
+ if (index == segmentLimit) { return start; }
+ c2 = src[index++];
+ }
+ } else { // Í
+ withAcute = true;
+ }
+
+ // Is the next character a j/J?
+ if (c2 == u'j') {
+ doTitleJ = true;
+ } else if (c2 == u'J') {
+ ++unchanged1;
+ } else {
+ return start;
+ }
+
+ // A plain i/I must be followed by a plain j/J.
+ // An i/I with acute must be followed by a j/J with acute.
+ if (withAcute) {
+ if (index == segmentLimit || src[index++] != ACUTE) { return start; }
+ if (doTitleJ) {
+ unchanged2 = 1;
+ } else {
+ ++unchanged1;
+ }
+ }
+
+ // There must not be another combining mark.
+ if (index < segmentLimit) {
+ int32_t cp;
+ int32_t i = index;
+ U16_NEXT(src, i, segmentLimit, cp);
+ uint32_t typeMask = U_GET_GC_MASK(cp);
+ if ((typeMask & U_GC_M_MASK) != 0) {
+ return start;
+ }
+ }
+
+ // Output the rest of the Dutch IJ.
+ destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
+ start += unchanged1;
+ if (doTitleJ) {
+ destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
+ if (edits != nullptr) {
+ edits->addReplace(1, 1);
+ }
+ ++start;
+ }
+ destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
+
+ U_ASSERT(start + unchanged2 == index);
+ return index;
+}
+
+} // namespace
+
U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
UChar *dest, int32_t destCapacity,
@@ -412,14 +506,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
csc.limit=srcLength;
int32_t destIndex=0;
int32_t prev=0;
- UBool isFirstIndex=TRUE;
+ bool isFirstIndex=true;
/* titlecasing loop */
while(prev<srcLength) {
/* find next index where to titlecase */
int32_t index;
if(isFirstIndex) {
- isFirstIndex=FALSE;
+ isFirstIndex=false;
index=iter->first();
} else {
index=iter->next();
@@ -446,7 +540,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
// Stop with titleStart<titleLimit<=index
// if there is a character to be titlecased,
// or else stop with titleStart==titleLimit==index.
- UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
+ bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
titleStart=titleLimit;
if(titleLimit==index) {
@@ -479,27 +573,15 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
/* Special case Dutch IJ titlecasing */
if (titleStart+1 < index &&
- caseLocale == UCASE_LOC_DUTCH &&
- (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
- if (src[titleStart+1] == 0x006A) {
- destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- if(edits!=NULL) {
- edits->addReplace(1, 1);
- }
- titleLimit++;
- } else if (src[titleStart+1] == 0x004A) {
- // Keep the capital J from getting lowercased.
- destIndex=appendUnchanged(dest, destIndex, destCapacity,
- src+titleStart+1, 1, options, edits);
- if(destIndex<0) {
- errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0;
- }
- titleLimit++;
+ caseLocale == UCASE_LOC_DUTCH) {
+ if (c < 0) {
+ c = ~c;
+ }
+
+ if (c == u'I' || c == u'Í') {
+ titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
+ dest, destIndex, destCapacity, options,
+ edits);
}
}
diff --git a/thirdparty/icu4c/common/uvector.cpp b/thirdparty/icu4c/common/uvector.cpp
index 4da8b864e1..844463921e 100644
--- a/thirdparty/icu4c/common/uvector.cpp
+++ b/thirdparty/icu4c/common/uvector.cpp
@@ -99,14 +99,6 @@ bool UVector::operator==(const UVector& other) const {
return true;
}
-// TODO: delete this function once all call sites have been migrated to the
-// new addElement().
-void UVector::addElementX(void* obj, UErrorCode &status) {
- if (ensureCapacityX(count + 1, status)) {
- elements[count++].pointer = obj;
- }
-}
-
void UVector::addElement(void* obj, UErrorCode &status) {
U_ASSERT(deleter == nullptr);
if (ensureCapacity(count + 1, status)) {
@@ -331,38 +323,6 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const {
return -1;
}
-UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) {
- if (minimumCapacity < 0) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- return FALSE;
- }
- if (capacity < minimumCapacity) {
- if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check
- status = U_ILLEGAL_ARGUMENT_ERROR;
- return FALSE;
- }
- int32_t newCap = capacity * 2;
- if (newCap < minimumCapacity) {
- newCap = minimumCapacity;
- }
- if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) { // integer overflow check
- // We keep the original memory contents on bad minimumCapacity.
- status = U_ILLEGAL_ARGUMENT_ERROR;
- return FALSE;
- }
- UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap);
- if (newElems == nullptr) {
- // We keep the original contents on the memory failure on realloc or bad minimumCapacity.
- status = U_MEMORY_ALLOCATION_ERROR;
- return FALSE;
- }
- elements = newElems;
- capacity = newCap;
- }
- return TRUE;
-}
-
-
UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
if (U_FAILURE(status)) {
return false;
@@ -370,7 +330,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
if (minimumCapacity < 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return false;
- }
+ }
if (capacity < minimumCapacity) {
if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check
status = U_ILLEGAL_ARGUMENT_ERROR;
@@ -396,6 +356,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
}
return true;
}
+
/**
* Change the size of this vector as follows: If newSize is smaller,
* then truncate the array, possibly deleting held elements for i >=
diff --git a/thirdparty/icu4c/common/uvector.h b/thirdparty/icu4c/common/uvector.h
index f61fcc2be6..1eb7d136e7 100644
--- a/thirdparty/icu4c/common/uvector.h
+++ b/thirdparty/icu4c/common/uvector.h
@@ -123,12 +123,6 @@ public:
// java.util.Vector API
//------------------------------------------------------------
- /*
- * Old version of addElement, with non-standard error handling.
- * Will be removed once all uses have been switched to the new addElement().
- */
- void addElementX(void* obj, UErrorCode &status);
-
/**
* Add an element at the end of the vector.
* For use only with vectors that do not adopt their elements, which is to say,
@@ -197,12 +191,6 @@ public:
inline UBool isEmpty(void) const {return count == 0;}
- /*
- * Old version of ensureCapacity, with non-standard error handling.
- * Will be removed once all uses have been switched to the new ensureCapacity().
- */
- UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status);
-
UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status);
/**
diff --git a/thirdparty/icu4c/common/uvectr32.cpp b/thirdparty/icu4c/common/uvectr32.cpp
index a77ecb689f..2b4d0b8a75 100644
--- a/thirdparty/icu4c/common/uvectr32.cpp
+++ b/thirdparty/icu4c/common/uvectr32.cpp
@@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) {
}
-bool UVector32::operator==(const UVector32& other) {
+bool UVector32::operator==(const UVector32& other) const {
int32_t i;
if (count != other.count) return false;
for (i=0; i<count; ++i) {
diff --git a/thirdparty/icu4c/common/uvectr32.h b/thirdparty/icu4c/common/uvectr32.h
index f08c2ade76..ecefb7af3e 100644
--- a/thirdparty/icu4c/common/uvectr32.h
+++ b/thirdparty/icu4c/common/uvectr32.h
@@ -86,12 +86,12 @@ public:
* equal if they are of the same size and all elements are equal,
* as compared using this object's comparer.
*/
- bool operator==(const UVector32& other);
+ bool operator==(const UVector32& other) const;
/**
* Equivalent to !operator==()
*/
- inline bool operator!=(const UVector32& other);
+ inline bool operator!=(const UVector32& other) const;
//------------------------------------------------------------
// java.util.Vector API
@@ -268,7 +268,7 @@ inline int32_t UVector32::lastElementi(void) const {
return elementAti(count-1);
}
-inline bool UVector32::operator!=(const UVector32& other) {
+inline bool UVector32::operator!=(const UVector32& other) const {
return !operator==(other);
}