// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ************************************************************************************ * Copyright (C) 2006-2016, International Business Machines Corporation * and others. All Rights Reserved. ************************************************************************************ */ #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/uchar.h" #include "unicode/uniset.h" #include "unicode/chariter.h" #include "unicode/ures.h" #include "unicode/udata.h" #include "unicode/putil.h" #include "unicode/ustring.h" #include "unicode/uscript.h" #include "unicode/ucharstrie.h" #include "unicode/bytestrie.h" #include "brkeng.h" #include "cmemory.h" #include "dictbe.h" #include "lstmbe.h" #include "charstr.h" #include "dictionarydata.h" #include "mutex.h" #include "uvector.h" #include "umutex.h" #include "uresimp.h" #include "ubrkimpl.h" U_NAMESPACE_BEGIN /* ****************************************************************** */ LanguageBreakEngine::LanguageBreakEngine() { } LanguageBreakEngine::~LanguageBreakEngine() { } /* ****************************************************************** */ LanguageBreakFactory::LanguageBreakFactory() { } LanguageBreakFactory::~LanguageBreakFactory() { } /* ****************************************************************** */ UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { (void)status; } UnhandledEngine::~UnhandledEngine() { delete fHandled; fHandled = nullptr; } UBool UnhandledEngine::handles(UChar32 c) const { return fHandled && fHandled->contains(c); } int32_t UnhandledEngine::findBreaks( UText *text, int32_t /* startPos */, int32_t endPos, UVector32 &/*foundBreaks*/, UBool /* isPhraseBreaking */, UErrorCode &status) const { if (U_FAILURE(status)) return 0; UChar32 c = utext_current32(text); while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { utext_next32(text); // TODO: recast loop to work with post-increment operations. c = utext_current32(text); } return 0; } void UnhandledEngine::handleCharacter(UChar32 c) { if (fHandled == nullptr) { fHandled = new UnicodeSet(); if (fHandled == nullptr) { return; } } if (!fHandled->contains(c)) { UErrorCode status = U_ZERO_ERROR; // Apply the entire script of the character. int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); } } /* ****************************************************************** */ ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { fEngines = 0; } ICULanguageBreakFactory::~ICULanguageBreakFactory() { if (fEngines != 0) { delete fEngines; } } U_NAMESPACE_END U_CDECL_BEGIN static void U_CALLCONV _deleteEngine(void *obj) { delete (const icu::LanguageBreakEngine *) obj; } U_CDECL_END U_NAMESPACE_BEGIN const LanguageBreakEngine * ICULanguageBreakFactory::getEngineFor(UChar32 c) { const LanguageBreakEngine *lbe = NULL; UErrorCode status = U_ZERO_ERROR; static UMutex gBreakEngineMutex; Mutex m(&gBreakEngineMutex); if (fEngines == nullptr) { LocalPointer<UStack> engines(new UStack(_deleteEngine, nullptr, status), status); if (U_FAILURE(status) ) { // Note: no way to return error code to caller. return nullptr; } fEngines = engines.orphan(); } else { int32_t i = fEngines->size(); while (--i >= 0) { lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); if (lbe != NULL && lbe->handles(c)) { return lbe; } } } // We didn't find an engine. Create one. lbe = loadEngineFor(c); if (lbe != nullptr) { fEngines->push((void *)lbe, status); } return U_SUCCESS(status) ? lbe : nullptr; } const LanguageBreakEngine * ICULanguageBreakFactory::loadEngineFor(UChar32 c) { UErrorCode status = U_ZERO_ERROR; UScriptCode code = uscript_getScript(c, &status); if (U_SUCCESS(status)) { const LanguageBreakEngine *engine = nullptr; // Try to use LSTM first const LSTMData *data = CreateLSTMDataForScript(code, status); if (U_SUCCESS(status)) { if (data != nullptr) { engine = CreateLSTMBreakEngine(code, data, status); if (U_SUCCESS(status) && engine != nullptr) { return engine; } if (engine != nullptr) { delete engine; engine = nullptr; } else { DeleteLSTMData(data); } } } status = U_ZERO_ERROR; // fallback to dictionary based DictionaryMatcher *m = loadDictionaryMatcherFor(code); if (m != NULL) { switch(code) { case USCRIPT_THAI: engine = new ThaiBreakEngine(m, status); break; case USCRIPT_LAO: engine = new LaoBreakEngine(m, status); break; case USCRIPT_MYANMAR: engine = new BurmeseBreakEngine(m, status); break; case USCRIPT_KHMER: engine = new KhmerBreakEngine(m, status); break; #if !UCONFIG_NO_NORMALIZATION // CJK not available w/o normalization case USCRIPT_HANGUL: engine = new CjkBreakEngine(m, kKorean, status); break; // use same BreakEngine and dictionary for both Chinese and Japanese case USCRIPT_HIRAGANA: case USCRIPT_KATAKANA: case USCRIPT_HAN: engine = new CjkBreakEngine(m, kChineseJapanese, status); break; #if 0 // TODO: Have to get some characters with script=common handled // by CjkBreakEngine (e.g. U+309B). Simply subjecting // them to CjkBreakEngine does not work. The engine has to // special-case them. case USCRIPT_COMMON: { UBlockCode block = ublock_getCode(code); if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) engine = new CjkBreakEngine(dict, kChineseJapanese, status); break; } #endif #endif default: break; } if (engine == NULL) { delete m; } else if (U_FAILURE(status)) { delete engine; engine = NULL; } return engine; } } return NULL; } DictionaryMatcher * ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { UErrorCode status = U_ZERO_ERROR; // open root from brkitr tree. UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); int32_t dictnlength = 0; const UChar *dictfname = ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); if (U_FAILURE(status)) { ures_close(b); return NULL; } CharString dictnbuf; CharString ext; const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot if (extStart != NULL) { int32_t len = (int32_t)(extStart - dictfname); ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); dictnlength = len; } dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); ures_close(b); UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); if (U_SUCCESS(status)) { // build trie const uint8_t *data = (const uint8_t *)udata_getMemory(file); const int32_t *indexes = (const int32_t *)data; const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; DictionaryMatcher *m = NULL; if (trieType == DictionaryData::TRIE_TYPE_BYTES) { const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; const char *characters = (const char *)(data + offset); m = new BytesDictionaryMatcher(characters, transform, file); } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { const UChar *characters = (const UChar *)(data + offset); m = new UCharsDictionaryMatcher(characters, file); } if (m == NULL) { // no matcher exists to take ownership - either we are an invalid // type or memory allocation failed udata_close(file); } return m; } else if (dictfname != NULL) { // we don't have a dictionary matcher. // returning NULL here will cause us to fail to find a dictionary break engine, as expected status = U_ZERO_ERROR; return NULL; } return NULL; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */