diff options
author | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2021-10-28 09:15:28 +0300 |
---|---|---|
committer | bruvzg <7645683+bruvzg@users.noreply.github.com> | 2021-10-28 09:15:28 +0300 |
commit | 44a241b241af6453d4459c79b1a562c447e36636 (patch) | |
tree | 0b90326241b6e647b9f8991cddeee5e8d4d93e9f /thirdparty/icu4c/common/uniset.cpp | |
parent | 157cba39331c5ca945c8c3bb2173c5363550a680 (diff) |
ICU: Update to version 70.1
Diffstat (limited to 'thirdparty/icu4c/common/uniset.cpp')
-rw-r--r-- | thirdparty/icu4c/common/uniset.cpp | 123 |
1 files changed, 66 insertions, 57 deletions
diff --git a/thirdparty/icu4c/common/uniset.cpp b/thirdparty/icu4c/common/uniset.cpp index 461e5a7197..92a81a1a02 100644 --- a/thirdparty/icu4c/common/uniset.cpp +++ b/thirdparty/icu4c/common/uniset.cpp @@ -111,7 +111,7 @@ static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); } -static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { +static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { const UnicodeString &a = *(const UnicodeString*)t1.pointer; const UnicodeString &b = *(const UnicodeString*)t2.pointer; return a.compare(b); @@ -278,14 +278,14 @@ UnicodeSet *UnicodeSet::cloneAsThawed() const { * @param o set to be compared for equality with this set. * @return <tt>true</tt> if the specified set is equal to this set. */ -UBool UnicodeSet::operator==(const UnicodeSet& o) const { - if (len != o.len) return FALSE; +bool UnicodeSet::operator==(const UnicodeSet& o) const { + if (len != o.len) return false; for (int32_t i = 0; i < len; ++i) { - if (list[i] != o.list[i]) return FALSE; + if (list[i] != o.list[i]) return false; } - if (hasStrings() != o.hasStrings()) { return FALSE; } - if (hasStrings() && *strings != *o.strings) return FALSE; - return TRUE; + if (hasStrings() != o.hasStrings()) { return false; } + if (hasStrings() && *strings != *o.strings) return false; + return true; } /** @@ -984,7 +984,6 @@ void UnicodeSet::_add(const UnicodeString& s) { strings->sortedInsert(t, compareUnicodeString, ec); if (U_FAILURE(ec)) { setToBogus(); - delete t; } } @@ -1968,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) * Append the <code>toPattern()</code> representation of a * string to the given <code>StringBuffer</code>. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool -escapeUnprintable) { +void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) { UChar32 cp; for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); @@ -1980,14 +1978,12 @@ escapeUnprintable) { * Append the <code>toPattern()</code> representation of a * character to the given <code>StringBuffer</code>. */ -void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool -escapeUnprintable) { - if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { +void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) { + if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything // unprintable - if (ICU_Utility::escapeUnprintable(buf, c)) { - return; - } + ICU_Utility::escape(buf, c); + return; } // Okay to let ':' pass through switch (c) { @@ -2013,6 +2009,19 @@ escapeUnprintable) { buf.append(c); } +void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end, + UBool escapeUnprintable) { + _appendToPat(result, start, escapeUnprintable); + if (start != end) { + if ((start+1) != end || + // Avoid writing what looks like a lead+trail surrogate pair. + start == 0xdbff) { + result.append(u'-'); + } + _appendToPat(result, end, escapeUnprintable); + } +} + /** * Append a string representation of this set to result. This will be * a cleaned version of the string passed to applyPattern(), if there @@ -2027,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, for (i=0; i<patLen; ) { UChar32 c; U16_NEXT(pat, i, patLen, c); - if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { + if (escapeUnprintable ? + ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) { // If the unprintable character is preceded by an odd // number of backslashes, then it has been escaped. // Before unescaping it, we delete the final @@ -2035,7 +2045,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, if ((backslashCount % 2) == 1) { result.truncate(result.length() - 1); } - ICU_Utility::escapeUnprintable(result, c); + ICU_Utility::escape(result, c); backslashCount = 0; } else { result.append(c); @@ -2074,52 +2084,51 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, { result.append(u'['); -// // Check against the predefined categories. We implicitly build -// // up ALL category sets the first time toPattern() is called. -// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { -// if (*this == getCategorySet(cat)) { -// result.append(u':'); -// result.append(CATEGORY_NAMES, cat*2, 2); -// return result.append(CATEGORY_CLOSE); -// } -// } - - int32_t count = getRangeCount(); + int32_t i = 0; + int32_t limit = len & ~1; // = 2 * getRangeCount() // If the set contains at least 2 intervals and includes both // MIN_VALUE and MAX_VALUE, then the inverse representation will // be more economical. - if (count > 1 && - getRangeStart(0) == MIN_VALUE && - getRangeEnd(count-1) == MAX_VALUE) { - + // if (getRangeCount() >= 2 && + // getRangeStart(0) == MIN_VALUE && + // getRangeEnd(last) == MAX_VALUE) + // Invariant: list[len-1] == HIGH == MAX_VALUE + 1 + // If limit == len then len is even and the last range ends with MAX_VALUE. + // + // *But* do not write the inverse (complement) if there are strings. + // Since ICU 70, the '^' performs a code point complement which removes all strings. + if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) { // Emit the inverse result.append(u'^'); - - for (int32_t i = 1; i < count; ++i) { - UChar32 start = getRangeEnd(i-1)+1; - UChar32 end = getRangeStart(i)-1; - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(u'-'); - } - _appendToPat(result, end, escapeUnprintable); + // Offsetting the inversion list index by one lets us + // iterate over the ranges of the set complement. + i = 1; + --limit; + } + + // Emit the ranges as pairs. + while (i < limit) { + UChar32 start = list[i]; // getRangeStart() + UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one + if (!(0xd800 <= end && end <= 0xdbff)) { + _appendToPat(result, start, end, escapeUnprintable); + i += 2; + } else { + // The range ends with a lead surrogate. + // Avoid writing what looks like a lead+trail surrogate pair. + // 1. Postpone ranges that start with a lead surrogate code point. + int32_t firstLead = i; + while ((i += 2) < limit && list[i] <= 0xdbff) {} + int32_t firstAfterLead = i; + // 2. Write following ranges that start with a trail surrogate code point. + while (i < limit && (start = list[i]) <= 0xdfff) { + _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable); + i += 2; } - } - } - - // Default; emit the ranges as pairs - else { - for (int32_t i = 0; i < count; ++i) { - UChar32 start = getRangeStart(i); - UChar32 end = getRangeEnd(i); - _appendToPat(result, start, escapeUnprintable); - if (start != end) { - if ((start+1) != end) { - result.append(u'-'); - } - _appendToPat(result, end, escapeUnprintable); + // 3. Now write the postponed ranges. + for (int j = firstLead; j < firstAfterLead; j += 2) { + _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable); } } } |