summaryrefslogtreecommitdiff
path: root/thirdparty/icu4c/common/dictbe.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/icu4c/common/dictbe.cpp')
-rw-r--r--thirdparty/icu4c/common/dictbe.cpp165
1 files changed, 125 insertions, 40 deletions
diff --git a/thirdparty/icu4c/common/dictbe.cpp b/thirdparty/icu4c/common/dictbe.cpp
index 4d158e3226..4fdbdf2760 100644
--- a/thirdparty/icu4c/common/dictbe.cpp
+++ b/thirdparty/icu4c/common/dictbe.cpp
@@ -17,7 +17,10 @@
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
+#include "unicode/resbund.h"
#include "unicode/ubrk.h"
+#include "unicode/usetiter.h"
+#include "ubrkimpl.h"
#include "utracimp.h"
#include "uvectr32.h"
#include "uvector.h"
@@ -48,6 +51,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
(void)startPos; // TODO: remove this param?
@@ -68,7 +72,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
}
rangeStart = start;
rangeEnd = current;
- result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
+ result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
utext_setNativeIndex(text, current);
return result;
@@ -199,13 +203,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
- fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fThaiWordSet);
+ setCharacters(thaiWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fThaiWordSet;
+ fEndWordSet = thaiWordSet;
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
@@ -230,6 +234,7 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
utext_setNativeIndex(text, rangeStart);
@@ -441,13 +446,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
- fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fLaoWordSet);
+ setCharacters(laoWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fLaoWordSet;
+ fEndWordSet = laoWordSet;
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
@@ -469,6 +474,7 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
@@ -637,14 +643,13 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
- fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
+ fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
+ fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.add(0x0020);
if (U_SUCCESS(status)) {
- setCharacters(fBurmeseWordSet);
+ setCharacters(fEndWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
- fMarkSet.add(0x0020);
- fEndWordSet = fBurmeseWordSet;
- fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
// Compact for caching.
fMarkSet.compact();
@@ -662,6 +667,7 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status ) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
@@ -830,13 +836,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
{
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
- fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
+ UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
if (U_SUCCESS(status)) {
- setCharacters(fKhmerWordSet);
+ setCharacters(khmerWordSet);
}
- fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
+ fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
fMarkSet.add(0x0020);
- fEndWordSet = fKhmerWordSet;
+ fEndWordSet = khmerWordSet;
fBeginWordSet.add(0x1780, 0x17B3);
//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
@@ -867,6 +873,7 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool /* isPhraseBreaking */,
UErrorCode& status ) const {
if (U_FAILURE(status)) return 0;
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
@@ -1050,25 +1057,27 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
- // Korean dictionary only includes Hangul syllables
- fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
- fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
- fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
- fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
-
- if (U_SUCCESS(status)) {
- // handle Korean and Japanese/Chinese using different dictionaries
- if (type == kKorean) {
+ // Korean dictionary only includes Hangul syllables
+ fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
+ fHangulWordSet.compact();
+ // Digits, open puncutation and Alphabetic characters.
+ fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
+ UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
+ fDigitOrOpenPunctuationOrAlphabetSet.compact();
+ fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
+ fClosePunctuationSet.compact();
+
+ // handle Korean and Japanese/Chinese using different dictionaries
+ if (type == kKorean) {
+ if (U_SUCCESS(status)) {
setCharacters(fHangulWordSet);
- } else { //Chinese and Japanese
- UnicodeSet cjSet;
- cjSet.addAll(fHanWordSet);
- cjSet.addAll(fKatakanaWordSet);
- cjSet.addAll(fHiraganaWordSet);
- cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
- cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
+ }
+ } else { //Chinese and Japanese
+ UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
+ if (U_SUCCESS(status)) {
setCharacters(cjSet);
+ initJapanesePhraseParameter(status);
}
}
UTRACE_EXIT_STATUS(status);
@@ -1096,14 +1105,12 @@ static inline bool isKatakana(UChar32 value) {
(value >= 0xFF66 && value <= 0xFF9f);
}
-
// Function for accessing internal utext flags.
// Replicates an internal UText function.
static inline int32_t utext_i32_flag(int32_t bitIndex) {
return (int32_t)1 << bitIndex;
}
-
/*
* @param text A UText representing the text
@@ -1117,6 +1124,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
int32_t rangeStart,
int32_t rangeEnd,
UVector32 &foundBreaks,
+ UBool isPhraseBreaking,
UErrorCode& status) const {
if (U_FAILURE(status)) return 0;
if (rangeStart >= rangeEnd) {
@@ -1347,6 +1355,31 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
t_boundary.addElement(numCodePts, status);
numBreaks++;
+ } else if (isPhraseBreaking) {
+ t_boundary.addElement(numCodePts, status);
+ if(U_SUCCESS(status)) {
+ numBreaks++;
+ int32_t prevIdx = numCodePts;
+
+ int32_t codeUnitIdx = -1;
+ int32_t prevCodeUnitIdx = -1;
+ int32_t length = -1;
+ for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
+ codeUnitIdx = inString.moveIndex32(0, i);
+ prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
+ // Calculate the length by using the code unit.
+ length = prevCodeUnitIdx - codeUnitIdx;
+ prevIdx = i;
+ // Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
+ // characters don't occur.
+ if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
+ && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
+ || !isKatakana(inString.char32At(codeUnitIdx)))) {
+ t_boundary.addElement(i, status);
+ numBreaks++;
+ }
+ }
+ }
} else {
for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
t_boundary.addElement(i, status);
@@ -1367,7 +1400,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
// while reversing t_boundary and pushing values to foundBreaks.
int32_t prevCPPos = -1;
int32_t prevUTextPos = -1;
- for (int32_t i = numBreaks-1; i >= 0; i--) {
+ int32_t correctedNumBreaks = 0;
+ for (int32_t i = numBreaks - 1; i >= 0; i--) {
int32_t cpPos = t_boundary.elementAti(i);
U_ASSERT(cpPos > prevCPPos);
int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
@@ -1375,7 +1409,15 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
if (utextPos > prevUTextPos) {
// Boundaries are added to foundBreaks output in ascending order.
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
- foundBreaks.push(utextPos, status);
+ // In phrase breaking, there has to be a breakpoint between Cj character and close
+ // punctuation.
+ // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
+ if (utextPos != rangeStart
+ || (isPhraseBreaking && utextPos > 0
+ && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
+ foundBreaks.push(utextPos, status);
+ correctedNumBreaks++;
+ }
} else {
// Normalization expanded the input text, the dictionary found a boundary
// within the expansion, giving two boundaries with the same index in the
@@ -1387,9 +1429,52 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
}
(void)prevCPPos; // suppress compiler warnings about unused variable
+ UChar32 nextChar = utext_char32At(inText, rangeEnd);
+ if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
+ // In phrase breaking, there has to be a breakpoint between Cj character and
+ // the number/open punctuation.
+ // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
+ // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
+ // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
+ if (isPhraseBreaking) {
+ if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
+ foundBreaks.popi();
+ correctedNumBreaks--;
+ }
+ } else {
+ foundBreaks.popi();
+ correctedNumBreaks--;
+ }
+ }
+
// inString goes out of scope
// inputMap goes out of scope
- return numBreaks;
+ return correctedNumBreaks;
+}
+
+void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
+ loadJapaneseExtensions(error);
+ loadHiragana(error);
+}
+
+void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
+ const char* tag = "extensions";
+ ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
+ if (U_SUCCESS(error)) {
+ ResourceBundle bundle = ja.get(tag, error);
+ while (U_SUCCESS(error) && bundle.hasNext()) {
+ fSkipSet.puti(bundle.getNextString(error), 1, error);
+ }
+ }
+}
+
+void CjkBreakEngine::loadHiragana(UErrorCode& error) {
+ UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
+ hiraganaWordSet.compact();
+ UnicodeSetIterator iterator(hiraganaWordSet);
+ while (iterator.next()) {
+ fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
+ }
}
#endif