1 files changed, 125 insertions, 40 deletions
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp
index 4d158e3226..4fdbdf2760 100644
--- a/thirdparty/icu4c/common/dictbe.cpp
+++ b/thirdparty/icu4c/common/dictbe.cpp
@@ -17,7 +17,10 @@
 #include "dictbe.h"
 #include "unicode/uniset.h"
 #include "unicode/chariter.h"
+#include "unicode/resbund.h"
 #include "unicode/ubrk.h"
+#include "unicode/usetiter.h"
+#include "ubrkimpl.h"
 #include "utracimp.h"
 #include "uvectr32.h"
 #include "uvector.h"
@@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
                                  int32_t startPos,
                                  int32_t endPos,
                                  UVector32 &foundBreaks,
+                                 UBool isPhraseBreaking,
                                  UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     (void)startPos;            // TODO: remove this param?
@@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
     }
     rangeStart = start;
     rangeEnd = current;
-    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
+    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
     utext_setNativeIndex(text, current);
     
     return result;
@@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
-    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fThaiWordSet);
+        setCharacters(thaiWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fThaiWordSet;
+    fEndWordSet = thaiWordSet;
     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
@@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     utext_setNativeIndex(text, rangeStart);
@@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
-    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fLaoWordSet);
+        setCharacters(laoWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fLaoWordSet;
+    fEndWordSet = laoWordSet;
     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
@@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
@@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
-    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
+    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
+    fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.add(0x0020);
     if (U_SUCCESS(status)) {
-        setCharacters(fBurmeseWordSet);
+        setCharacters(fEndWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
-    fMarkSet.add(0x0020);
-    fEndWordSet = fBurmeseWordSet;
-    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
 
     // Compact for caching.
     fMarkSet.compact();
@@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status ) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
@@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
 {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
-    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
+    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
     if (U_SUCCESS(status)) {
-        setCharacters(fKhmerWordSet);
+        setCharacters(khmerWordSet);
     }
-    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+    fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
     fMarkSet.add(0x0020);
-    fEndWordSet = fKhmerWordSet;
+    fEndWordSet = khmerWordSet;
     fBeginWordSet.add(0x1780, 0x17B3);
     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
@@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                 int32_t rangeStart,
                                                 int32_t rangeEnd,
                                                 UVector32 &foundBreaks,
+                                                UBool /* isPhraseBreaking */,
                                                 UErrorCode& status ) const {
     if (U_FAILURE(status)) return 0;
     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
@@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
 : DictionaryBreakEngine(), fDictionary(adoptDictionary) {
     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
-    // Korean dictionary only includes Hangul syllables
-    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
-    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
-    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
-    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
     nfkcNorm2 = Normalizer2::getNFKCInstance(status);
-
-    if (U_SUCCESS(status)) {
-        // handle Korean and Japanese/Chinese using different dictionaries
-        if (type == kKorean) {
+    // Korean dictionary only includes Hangul syllables
+    fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
+    fHangulWordSet.compact();
+    // Digits, open punctuation and Alphabetic characters.
+    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
+        UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
+    fDigitOrOpenPunctuationOrAlphabetSet.compact();
+    fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
+    fClosePunctuationSet.compact();
+
+    // handle Korean and Japanese/Chinese using different dictionaries
+    if (type == kKorean) {
+        if (U_SUCCESS(status)) {
             setCharacters(fHangulWordSet);
-        } else { //Chinese and Japanese
-            UnicodeSet cjSet;
-            cjSet.addAll(fHanWordSet);
-            cjSet.addAll(fKatakanaWordSet);
-            cjSet.addAll(fHiraganaWordSet);
-            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
-            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+        }
+    } else { //Chinese and Japanese
+        UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+        if (U_SUCCESS(status)) {
             setCharacters(cjSet);
+            initJapanesePhraseParameter(status);
         }
     }
     UTRACE_EXIT_STATUS(status);
@@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) {
             (value >= 0xFF66 && value <= 0xFF9f);
 }
 
-
 // Function for accessing internal utext flags.
 //   Replicates an internal UText function.
 
 static inline int32_t utext_i32_flag(int32_t bitIndex) {
     return (int32_t)1 << bitIndex;
 }
-
        
 /*
  * @param text A UText representing the text
@@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         int32_t rangeStart,
         int32_t rangeEnd,
         UVector32 &foundBreaks,
+        UBool isPhraseBreaking,
         UErrorCode& status) const {
     if (U_FAILURE(status)) return 0;
     if (rangeStart >= rangeEnd) {
@@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
         t_boundary.addElement(numCodePts, status);
         numBreaks++;
+    } else if (isPhraseBreaking) {
+        t_boundary.addElement(numCodePts, status);
+        if(U_SUCCESS(status)) {
+            numBreaks++;
+            int32_t prevIdx = numCodePts;
+
+            int32_t codeUnitIdx = -1;
+            int32_t prevCodeUnitIdx = -1;
+            int32_t length = -1;
+            for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
+                codeUnitIdx = inString.moveIndex32(0, i);
+                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
+                // Calculate the length by using the code unit.
+                length = prevCodeUnitIdx - codeUnitIdx;
+                prevIdx = i;
+                // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+                // characters don't occur.
+                if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
+                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
+                           || !isKatakana(inString.char32At(codeUnitIdx)))) {
+                    t_boundary.addElement(i, status);
+                    numBreaks++;
+                }
+            }
+        }
     } else {
         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
             t_boundary.addElement(i, status);
@@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     // while reversing t_boundary and pushing values to foundBreaks.
     int32_t prevCPPos = -1;
     int32_t prevUTextPos = -1;
-    for (int32_t i = numBreaks-1; i >= 0; i--) {
+    int32_t correctedNumBreaks = 0;
+    for (int32_t i = numBreaks - 1; i >= 0; i--) {
         int32_t cpPos = t_boundary.elementAti(i);
         U_ASSERT(cpPos > prevCPPos);
         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
@@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
         if (utextPos > prevUTextPos) {
             // Boundaries are added to foundBreaks output in ascending order.
             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
-            foundBreaks.push(utextPos, status);
+            // In phrase breaking, there has to be a breakpoint between Cj character and close
+            // punctuation.
+            // E.g.［携帯電話］正しい選択 -> ［携帯▁電話］▁正しい▁選択 -> breakpoint between ］ and 正
+            if (utextPos != rangeStart
+                || (isPhraseBreaking && utextPos > 0
+                       && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+                foundBreaks.push(utextPos, status);
+                correctedNumBreaks++;
+            }
         } else {
             // Normalization expanded the input text, the dictionary found a boundary
             // within the expansion, giving two boundaries with the same index in the
@@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
     }
     (void)prevCPPos; // suppress compiler warnings about unused variable
 
+    UChar32 nextChar = utext_char32At(inText, rangeEnd);
+    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+        // In phrase breaking, there has to be a breakpoint between Cj character and
+        // the number/open punctuation.
+        // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+        // E.g. 乗車率９０％程度だろうか -> 乗車▁率▁９０％▁程度だろうか -> breakpoint between 率 and ９
+        // E.g. しかもロゴがＵｎｉｃｏｄｅ！ -> しかも▁ロゴが▁Ｕｎｉｃｏｄｅ！-> breakpoint between が and Ｕ
+        if (isPhraseBreaking) {
+            if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+                foundBreaks.popi();
+                correctedNumBreaks--;
+            }
+        } else {
+            foundBreaks.popi();
+            correctedNumBreaks--;
+        }
+    }
+
     // inString goes out of scope
     // inputMap goes out of scope
-    return numBreaks;
+    return correctedNumBreaks;
+}
+
+void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {  // fills fSkipSet; failures are reported through `error`
+    loadJapaneseExtensions(error);  // first: the strings of the "ja" bundle's "extensions" resource
+    loadHiragana(error);            // then: one single-character key per [:Hiragana:] code point
+}
+
+void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
+    // Adds every string of the "extensions" resource in the U_ICUDATA_BRKITR "ja"
+    // bundle to fSkipSet with value 1; the loop ends once `error` holds a failure.
+    ResourceBundle jaBundle(U_ICUDATA_BRKITR, "ja", error);
+    if (U_FAILURE(error)) return;
+    ResourceBundle extensions = jaBundle.get("extensions", error);
+    while (U_SUCCESS(error) && extensions.hasNext()) {
+        fSkipSet.puti(extensions.getNextString(error), 1, error);
+    }
+}
+
+void CjkBreakEngine::loadHiragana(UErrorCode& error) {
+    // Each code point matched by [:Hiragana:] becomes a one-character fSkipSet key (value 1).
+    UnicodeSet kanaSet(UnicodeString(u"[:Hiragana:]"), error);
+    kanaSet.compact();
+    for (UnicodeSetIterator cursor(kanaSet); cursor.next();) {
+        fSkipSet.puti(UnicodeString(cursor.getCodepoint()), 1, error);
+    }
 }
 #endif