diff options
Diffstat (limited to 'thirdparty/icu4c/common/dictbe.cpp')
-rw-r--r-- | thirdparty/icu4c/common/dictbe.cpp | 165 |
1 files changed, 125 insertions, 40 deletions
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp index 4d158e3226..4fdbdf2760 100644 --- a/thirdparty/icu4c/common/dictbe.cpp +++ b/thirdparty/icu4c/common/dictbe.cpp @@ -17,7 +17,10 @@ #include "dictbe.h" #include "unicode/uniset.h" #include "unicode/chariter.h" +#include "unicode/resbund.h" #include "unicode/ubrk.h" +#include "unicode/usetiter.h" +#include "ubrkimpl.h" #include "utracimp.h" #include "uvectr32.h" #include "uvector.h" @@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text, int32_t startPos, int32_t endPos, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; (void)startPos; // TODO: remove this param? @@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text, } rangeStart = start; rangeEnd = current; - result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status); + result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status); utext_setNativeIndex(text, current); return result; @@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai"); - fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); + UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fThaiWordSet); + setCharacters(thaiWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fThaiWordSet; + fEndWordSet = thaiWordSet; fEndWordSet.remove(0x0E31); // MAI HAN-AKAT fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK @@ -230,6 +234,7 @@ 
ThaiBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; utext_setNativeIndex(text, rangeStart); @@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo"); - fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); + UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fLaoWordSet); + setCharacters(laoWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fLaoWordSet; + fEndWordSet = laoWordSet; fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) @@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { @@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr"); - fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); + fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels + fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status); + 
fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.add(0x0020); if (U_SUCCESS(status)) { - setCharacters(fBurmeseWordSet); + setCharacters(fEndWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); - fMarkSet.add(0x0020); - fEndWordSet = fBurmeseWordSet; - fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels // Compact for caching. fMarkSet.compact(); @@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { @@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr"); - fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); + UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status); if (U_SUCCESS(status)) { - setCharacters(fKhmerWordSet); + setCharacters(khmerWordSet); } - fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); + fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); fMarkSet.add(0x0020); - fEndWordSet = fKhmerWordSet; + fEndWordSet = khmerWordSet; fBeginWordSet.add(0x1780, 0x17B3); //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word @@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool /* isPhraseBreaking */, UErrorCode& status ) const { if (U_FAILURE(status)) return 0; if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { @@ -1050,25 +1057,27 @@ 
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType : DictionaryBreakEngine(), fDictionary(adoptDictionary) { UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE); UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani"); - // Korean dictionary only includes Hangul syllables - fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); - fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); - fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); - fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); nfkcNorm2 = Normalizer2::getNFKCInstance(status); - - if (U_SUCCESS(status)) { - // handle Korean and Japanese/Chinese using different dictionaries - if (type == kKorean) { + // Korean dictionary only includes Hangul syllables + fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status); + fHangulWordSet.compact(); + // Digits, open punctuation and Alphabetic characters. 
+ fDigitOrOpenPunctuationOrAlphabetSet.applyPattern( + UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status); + fDigitOrOpenPunctuationOrAlphabetSet.compact(); + fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status); + fClosePunctuationSet.compact(); + + // handle Korean and Japanese/Chinese using different dictionaries + if (type == kKorean) { + if (U_SUCCESS(status)) { setCharacters(fHangulWordSet); - } else { //Chinese and Japanese - UnicodeSet cjSet; - cjSet.addAll(fHanWordSet); - cjSet.addAll(fKatakanaWordSet); - cjSet.addAll(fHiraganaWordSet); - cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK - cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK + } + } else { //Chinese and Japanese + UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status); + if (U_SUCCESS(status)) { setCharacters(cjSet); + initJapanesePhraseParameter(status); } } UTRACE_EXIT_STATUS(status); @@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) { (value >= 0xFF66 && value <= 0xFF9f); } - // Function for accessing internal utext flags. // Replicates an internal UText function. 
static inline int32_t utext_i32_flag(int32_t bitIndex) { return (int32_t)1 << bitIndex; } - /* * @param text A UText representing the text @@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, int32_t rangeStart, int32_t rangeEnd, UVector32 &foundBreaks, + UBool isPhraseBreaking, UErrorCode& status) const { if (U_FAILURE(status)) return 0; if (rangeStart >= rangeEnd) { @@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { t_boundary.addElement(numCodePts, status); numBreaks++; + } else if (isPhraseBreaking) { + t_boundary.addElement(numCodePts, status); + if(U_SUCCESS(status)) { + numBreaks++; + int32_t prevIdx = numCodePts; + + int32_t codeUnitIdx = -1; + int32_t prevCodeUnitIdx = -1; + int32_t length = -1; + for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) { + codeUnitIdx = inString.moveIndex32(0, i); + prevCodeUnitIdx = inString.moveIndex32(0, prevIdx); + // Calculate the length by using the code unit. + length = prevCodeUnitIdx - codeUnitIdx; + prevIdx = i; + // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana + // characters don't occur. + if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length)) + && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1))) + || !isKatakana(inString.char32At(codeUnitIdx)))) { + t_boundary.addElement(i, status); + numBreaks++; + } + } + } } else { for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { t_boundary.addElement(i, status); @@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, // while reversing t_boundary and pushing values to foundBreaks. 
int32_t prevCPPos = -1; int32_t prevUTextPos = -1; - for (int32_t i = numBreaks-1; i >= 0; i--) { + int32_t correctedNumBreaks = 0; + for (int32_t i = numBreaks - 1; i >= 0; i--) { int32_t cpPos = t_boundary.elementAti(i); U_ASSERT(cpPos > prevCPPos); int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart; @@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, if (utextPos > prevUTextPos) { // Boundaries are added to foundBreaks output in ascending order. U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos); - foundBreaks.push(utextPos, status); + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + if (utextPos != rangeStart + || (isPhraseBreaking && utextPos > 0 + && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) { + foundBreaks.push(utextPos, status); + correctedNumBreaks++; + } } else { // Normalization expanded the input text, the dictionary found a boundary // within the expansion, giving two boundaries with the same index in the @@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } (void)prevCPPos; // suppress compiler warnings about unused variable + UChar32 nextChar = utext_char32At(inText, rangeEnd); + if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) { + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 + // E.g. しかもロゴがUnicode! 
-> しかも▁ロゴが▁Unicode!-> breakpoint between が and U + if (isPhraseBreaking) { + if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) { + foundBreaks.popi(); + correctedNumBreaks--; + } + } else { + foundBreaks.popi(); + correctedNumBreaks--; + } + } + // inString goes out of scope // inputMap goes out of scope - return numBreaks; + return correctedNumBreaks; +} + +void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) { + loadJapaneseExtensions(error); + loadHiragana(error); +} + +void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) { + const char* tag = "extensions"; + ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error); + if (U_SUCCESS(error)) { + ResourceBundle bundle = ja.get(tag, error); + while (U_SUCCESS(error) && bundle.hasNext()) { + fSkipSet.puti(bundle.getNextString(error), 1, error); + } + } +} + +void CjkBreakEngine::loadHiragana(UErrorCode& error) { + UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error); + hiraganaWordSet.compact(); + UnicodeSetIterator iterator(hiraganaWordSet); + while (iterator.next()) { + fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error); + } } #endif |