// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // rbbisetb.cpp // /* *************************************************************************** * Copyright (C) 2002-2008 International Business Machines Corporation * * and others. All rights reserved. * *************************************************************************** */ // // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules // (part of the rule building process.) // // Starting with the rules parse tree from the scanner, // // - Enumerate the set of UnicodeSets that are referenced // by the RBBI rules. // - compute a set of non-overlapping character ranges // with all characters within a range belonging to the same // set of input unicode sets. // - Derive a set of non-overlapping UnicodeSet (like things) // that will correspond to columns in the state table for // the RBBI execution engine. All characters within one // of these sets belong to the same set of the original // UnicodeSets from the user's rules. // - construct the trie table that maps input characters // to the index of the matching non-overlapping set of set from // the previous step. // #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/uniset.h" #include "uvector.h" #include "uassert.h" #include "cmemory.h" #include "cstring.h" #include "rbbisetb.h" #include "rbbinode.h" U_NAMESPACE_BEGIN const int32_t kMaxCharCategoriesFor8BitsTrie = 255; //------------------------------------------------------------------------ // // Constructor // //------------------------------------------------------------------------ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) { fRB = rb; fStatus = rb->fStatus; fRangeList = nullptr; fMutableTrie = nullptr; fTrie = nullptr; fTrieSize = 0; fGroupCount = 0; fSawBOF = false; } //------------------------------------------------------------------------ // // Destructor // //------------------------------------------------------------------------ RBBISetBuilder::~RBBISetBuilder() { RangeDescriptor *nextRangeDesc; // Walk through & delete the linked list of RangeDescriptors for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) { RangeDescriptor *r = nextRangeDesc; nextRangeDesc = r->fNext; delete r; } ucptrie_close(fTrie); umutablecptrie_close(fMutableTrie); } //------------------------------------------------------------------------ // // build Build the list of non-overlapping character ranges // from the Unicode Sets. // //------------------------------------------------------------------------ void RBBISetBuilder::buildRanges() { RBBINode *usetNode; RangeDescriptor *rlRange; if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();} // // Initialize the process by creating a single range encompassing all characters // that is in no sets. // fRangeList = new RangeDescriptor(*fStatus); // will check for status here if (fRangeList == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } fRangeList->fStartChar = 0; fRangeList->fEndChar = 0x10ffff; if (U_FAILURE(*fStatus)) { return; } // // Find the set of non-overlapping ranges of characters // int ni; for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; int32_t inputSetRangeCount = inputSet->getRangeCount(); int inputSetRangeIndex = 0; rlRange = fRangeList; for (;;) { if (inputSetRangeIndex >= inputSetRangeCount) { break; } UChar32 inputSetRangeBegin = inputSet->getRangeStart(inputSetRangeIndex); UChar32 inputSetRangeEnd = inputSet->getRangeEnd(inputSetRangeIndex); // skip over ranges from the range list that are completely // below the current range from the input unicode set. while (rlRange->fEndChar < inputSetRangeBegin) { rlRange = rlRange->fNext; } // If the start of the range from the range list is before with // the start of the range from the unicode set, split the range list range // in two, with one part being before (wholly outside of) the unicode set // and the other containing the rest. // Then continue the loop; the post-split current range will then be skipped // over if (rlRange->fStartChar < inputSetRangeBegin) { rlRange->split(inputSetRangeBegin, *fStatus); if (U_FAILURE(*fStatus)) { return; } continue; } // Same thing at the end of the ranges... // If the end of the range from the range list doesn't coincide with // the end of the range from the unicode set, split the range list // range in two. The first part of the split range will be // wholly inside the Unicode set. if (rlRange->fEndChar > inputSetRangeEnd) { rlRange->split(inputSetRangeEnd+1, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // The current rlRange is now entirely within the UnicodeSet range. // Add this unicode set to the list of sets for this rlRange if (rlRange->fIncludesSets->indexOf(usetNode) == -1) { rlRange->fIncludesSets->addElement(usetNode, *fStatus); if (U_FAILURE(*fStatus)) { return; } } // Advance over ranges that we are finished with. if (inputSetRangeEnd == rlRange->fEndChar) { inputSetRangeIndex++; } rlRange = rlRange->fNext; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();} // // Group the above ranges, with each group consisting of one or more // ranges that are in exactly the same set of original UnicodeSets. // The groups are numbered, and these group numbers are the set of // input symbols recognized by the run-time state machine. // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input // # 2 is reserved - table column 2 is for beginning-of-input // # 3 is the first range list. // RangeDescriptor *rlSearchRange; int32_t dictGroupCount = 0; for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) { if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) { rlRange->fNum = rlSearchRange->fNum; rlRange->fIncludesDict = rlSearchRange->fIncludesDict; break; } } if (rlRange->fNum == 0) { rlRange->fFirstInGroup = true; if (rlRange->isDictionaryRange()) { rlRange->fNum = ++dictGroupCount; rlRange->fIncludesDict = true; } else { fGroupCount++; rlRange->fNum = fGroupCount+2; addValToSets(rlRange->fIncludesSets, rlRange->fNum); } } } // Move the character category numbers for any dictionary ranges up, so that they // immediately follow the non-dictionary ranges. fDictCategoriesStart = fGroupCount + 3; for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { if (rlRange->fIncludesDict) { rlRange->fNum += fDictCategoriesStart - 1; if (rlRange->fFirstInGroup) { addValToSets(rlRange->fIncludesSets, rlRange->fNum); } } } fGroupCount += dictGroupCount; // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. // Column 2 is reserved for before-the-start-input. // (This column can be optimized away later if there are no rule // references to {bof}.) // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} // Because {bof} and {eof} are not characters in the normal sense, // they don't affect the computation of the ranges or TRIE. UnicodeString eofString(u"eof"); UnicodeString bofString(u"bof"); for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { break; } UnicodeSet *inputSet = usetNode->fInputSet; if (inputSet->contains(eofString)) { addValToSet(usetNode, 1); } if (inputSet->contains(bofString)) { addValToSet(usetNode, 2); fSawBOF = TRUE; } } if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();} if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();} } // // Build the Trie table for mapping UChar32 values to the corresponding // range group number. // void RBBISetBuilder::buildTrie() { fMutableTrie = umutablecptrie_open( 0, // Initial value for all code points. 0, // Error value for out-of-range input. fStatus); for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) { umutablecptrie_setRange(fMutableTrie, range->fStartChar, // Range start range->fEndChar, // Range end (inclusive) range->fNum, // value for range fStatus); } } void RBBISetBuilder::mergeCategories(IntPair categories) { U_ASSERT(categories.first >= 1); U_ASSERT(categories.second > categories.first); U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { int32_t rangeNum = rd->fNum; if (rangeNum == categories.second) { rd->fNum = categories.first; } else if (rangeNum > categories.second) { rd->fNum--; } } --fGroupCount; if (categories.second <= fDictCategoriesStart) { --fDictCategoriesStart; } } //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int32_t RBBISetBuilder::getTrieSize() { if (U_FAILURE(*fStatus)) { return 0; } if (fTrie == nullptr) { bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie; fTrie = umutablecptrie_buildImmutable( fMutableTrie, UCPTRIE_TYPE_FAST, use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16, fStatus); fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus); if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { *fStatus = U_ZERO_ERROR; } } return fTrieSize; } //----------------------------------------------------------------------------------- // // serializeTrie() Put the serialized trie at the specified address. // Trust the caller to have given us enough memory. // getTrieSize() MUST be called first. // //----------------------------------------------------------------------------------- void RBBISetBuilder::serializeTrie(uint8_t *where) { ucptrie_toBinary(fTrie, where, // Buffer fTrieSize, // Capacity fStatus); } //------------------------------------------------------------------------ // // addValToSets Add a runtime-mapped input value to each uset from a // list of uset nodes. (val corresponds to a state table column.) // For each of the original Unicode sets - which correspond // directly to uset nodes - a logically equivalent expression // is constructed in terms of the remapped runtime input // symbol set. This function adds one runtime input symbol to // a list of sets. // // The "logically equivalent expression" is the tree for an // or-ing together of all of the symbols that go into the set. // //------------------------------------------------------------------------ void RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) { int32_t ix; for (ix=0; ix<sets->size(); ix++) { RBBINode *usetNode = (RBBINode *)sets->elementAt(ix); addValToSet(usetNode, val); } } void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { RBBINode *leafNode = new RBBINode(RBBINode::leafChar); if (leafNode == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } leafNode->fVal = (unsigned short)val; if (usetNode->fLeftChild == NULL) { usetNode->fLeftChild = leafNode; leafNode->fParent = usetNode; } else { // There are already input symbols present for this set. // Set up an OR node, with the previous stuff as the left child // and the new value as the right child. RBBINode *orNode = new RBBINode(RBBINode::opOr); if (orNode == NULL) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return; } orNode->fLeftChild = usetNode->fLeftChild; orNode->fRightChild = leafNode; orNode->fLeftChild->fParent = orNode; orNode->fRightChild->fParent = orNode; usetNode->fLeftChild = orNode; orNode->fParent = usetNode; } } //------------------------------------------------------------------------ // // getNumCharCategories // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getNumCharCategories() const { return fGroupCount + 3; } //------------------------------------------------------------------------ // // getDictCategoriesStart // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getDictCategoriesStart() const { return fDictCategoriesStart; } //------------------------------------------------------------------------ // // sawBOF // //------------------------------------------------------------------------ UBool RBBISetBuilder::sawBOF() const { return fSawBOF; } //------------------------------------------------------------------------ // // getFirstChar Given a runtime RBBI character category, find // the first UChar32 that is in the set of chars // in the category. //------------------------------------------------------------------------ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const { RangeDescriptor *rlRange; UChar32 retVal = (UChar32)-1; for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { if (rlRange->fNum == category) { retVal = rlRange->fStartChar; break; } } return retVal; } //------------------------------------------------------------------------ // // printRanges A debugging function. // dump out all of the range definitions. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRanges() { RangeDescriptor *rlRange; int i; RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n"); for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar); for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName {u"anon"}; RBBINode *setRef = usetNode->fParent; if (setRef != nullptr) { RBBINode *varRef = setRef->fParent; if (varRef != nullptr && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } RBBIDebugPrintf("\n"); } } #endif //------------------------------------------------------------------------ // // printRangeGroups A debugging function. // dump out all of the range groups. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printRangeGroups() { int i; RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n"); for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) { if (rlRange->fFirstInGroup) { int groupNum = rlRange->fNum; RBBIDebugPrintf("%2i ", groupNum); if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");} for (i=0; i<rlRange->fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); UnicodeString setName = UNICODE_STRING("anon", 4); RBBINode *setRef = usetNode->fParent; if (setRef != NULL) { RBBINode *varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); } i = 0; for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) { if (tRange->fNum == rlRange->fNum) { if (i++ % 5 == 0) { RBBIDebugPrintf("\n "); } RBBIDebugPrintf(" %05x-%05x", tRange->fStartChar, tRange->fEndChar); } } RBBIDebugPrintf("\n"); } } RBBIDebugPrintf("\n"); } #endif //------------------------------------------------------------------------ // // printSets A debugging function. // dump out all of the set definitions. // //------------------------------------------------------------------------ #ifdef RBBI_DEBUG void RBBISetBuilder::printSets() { int i; RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n"); for (i=0; ; i++) { RBBINode *usetNode; RBBINode *setRef; RBBINode *varRef; UnicodeString setName; usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i); if (usetNode == NULL) { break; } RBBIDebugPrintf("%3d ", i); setName = UNICODE_STRING("anonymous", 9); setRef = usetNode->fParent; if (setRef != NULL) { varRef = setRef->fParent; if (varRef != NULL && varRef->fType == RBBINode::varRef) { setName = varRef->fText; } } RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" "); RBBI_DEBUG_printUnicodeString(usetNode->fText); RBBIDebugPrintf("\n"); if (usetNode->fLeftChild != NULL) { RBBINode::printTree(usetNode->fLeftChild, TRUE); } } RBBIDebugPrintf("\n"); } #endif //------------------------------------------------------------------------------------- // // RangeDescriptor copy constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) : fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum}, fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} { if (U_FAILURE(status)) { return; } fIncludesSets = new UVector(status); if (this->fIncludesSets == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } if (U_FAILURE(status)) { return; } for (int32_t i=0; i<other.fIncludesSets->size(); i++) { this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status); } } //------------------------------------------------------------------------------------- // // RangeDesriptor default constructor // //------------------------------------------------------------------------------------- RangeDescriptor::RangeDescriptor(UErrorCode &status) { if (U_FAILURE(status)) { return; } fIncludesSets = new UVector(status); if (fIncludesSets == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } } //------------------------------------------------------------------------------------- // // RangeDesriptor Destructor // //------------------------------------------------------------------------------------- RangeDescriptor::~RangeDescriptor() { delete fIncludesSets; fIncludesSets = nullptr; } //------------------------------------------------------------------------------------- // // RangeDesriptor::split() // //------------------------------------------------------------------------------------- void RangeDescriptor::split(UChar32 where, UErrorCode &status) { U_ASSERT(where>fStartChar && where<=fEndChar); RangeDescriptor *nr = new RangeDescriptor(*this, status); if(nr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(status)) { delete nr; return; } // RangeDescriptor copy constructor copies all fields. // Only need to update those that are different after the split. nr->fStartChar = where; this->fEndChar = where-1; nr->fNext = this->fNext; this->fNext = nr; } //------------------------------------------------------------------------------------- // // RangeDescriptor::isDictionaryRange // // Test whether this range includes characters from // the original Unicode Set named "dictionary". // // This function looks through the Unicode Sets that // the range includes, checking for one named "dictionary" // // TODO: a faster way would be to find the set node for // "dictionary" just once, rather than looking it // up by name every time. // //------------------------------------------------------------------------------------- bool RangeDescriptor::isDictionaryRange() { static const char16_t *dictionary = u"dictionary"; for (int32_t i=0; i<fIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i); RBBINode *setRef = usetNode->fParent; if (setRef != nullptr) { RBBINode *varRef = setRef->fParent; if (varRef && varRef->fType == RBBINode::varRef) { const UnicodeString *setName = &varRef->fText; if (setName->compare(dictionary, -1) == 0) { return true; } } } } return false; } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */