summaryrefslogtreecommitdiff
path: root/thirdparty/icu4c/common/uniset.cpp
diff options
context:
space:
mode:
authorbruvzg <7645683+bruvzg@users.noreply.github.com>2021-10-28 09:15:28 +0300
committerbruvzg <7645683+bruvzg@users.noreply.github.com>2021-10-28 09:15:28 +0300
commit44a241b241af6453d4459c79b1a562c447e36636 (patch)
tree0b90326241b6e647b9f8991cddeee5e8d4d93e9f /thirdparty/icu4c/common/uniset.cpp
parent157cba39331c5ca945c8c3bb2173c5363550a680 (diff)
ICU: Update to version 70.1
Diffstat (limited to 'thirdparty/icu4c/common/uniset.cpp')
-rw-r--r--thirdparty/icu4c/common/uniset.cpp123
1 files changed, 66 insertions, 57 deletions
diff --git a/thirdparty/icu4c/common/uniset.cpp b/thirdparty/icu4c/common/uniset.cpp
index 461e5a7197..92a81a1a02 100644
--- a/thirdparty/icu4c/common/uniset.cpp
+++ b/thirdparty/icu4c/common/uniset.cpp
@@ -111,7 +111,7 @@ static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
}
-static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
+static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
const UnicodeString &a = *(const UnicodeString*)t1.pointer;
const UnicodeString &b = *(const UnicodeString*)t2.pointer;
return a.compare(b);
@@ -278,14 +278,14 @@ UnicodeSet *UnicodeSet::cloneAsThawed() const {
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
*/
-UBool UnicodeSet::operator==(const UnicodeSet& o) const {
- if (len != o.len) return FALSE;
+bool UnicodeSet::operator==(const UnicodeSet& o) const {
+ if (len != o.len) return false;
for (int32_t i = 0; i < len; ++i) {
- if (list[i] != o.list[i]) return FALSE;
+ if (list[i] != o.list[i]) return false;
}
- if (hasStrings() != o.hasStrings()) { return FALSE; }
- if (hasStrings() && *strings != *o.strings) return FALSE;
- return TRUE;
+ if (hasStrings() != o.hasStrings()) { return false; }
+ if (hasStrings() && *strings != *o.strings) return false;
+ return true;
}
/**
@@ -984,7 +984,6 @@ void UnicodeSet::_add(const UnicodeString& s) {
strings->sortedInsert(t, compareUnicodeString, ec);
if (U_FAILURE(ec)) {
setToBogus();
- delete t;
}
}
@@ -1968,8 +1967,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity)
* Append the <code>toPattern()</code> representation of a
* string to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
-escapeUnprintable) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
UChar32 cp;
for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
_appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
@@ -1980,14 +1978,12 @@ escapeUnprintable) {
* Append the <code>toPattern()</code> representation of a
* character to the given <code>StringBuffer</code>.
*/
-void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
-escapeUnprintable) {
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
+ if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
// unprintable
- if (ICU_Utility::escapeUnprintable(buf, c)) {
- return;
- }
+ ICU_Utility::escape(buf, c);
+ return;
}
// Okay to let ':' pass through
switch (c) {
@@ -2013,6 +2009,19 @@ escapeUnprintable) {
buf.append(c);
}
+void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
+ UBool escapeUnprintable) {
+ _appendToPat(result, start, escapeUnprintable);
+ if (start != end) {
+ if ((start+1) != end ||
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ start == 0xdbff) {
+ result.append(u'-');
+ }
+ _appendToPat(result, end, escapeUnprintable);
+ }
+}
+
/**
* Append a string representation of this set to result. This will be
* a cleaned version of the string passed to applyPattern(), if there
@@ -2027,7 +2036,8 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
for (i=0; i<patLen; ) {
UChar32 c;
U16_NEXT(pat, i, patLen, c);
- if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
+ if (escapeUnprintable ?
+ ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
// If the unprintable character is preceded by an odd
// number of backslashes, then it has been escaped.
// Before unescaping it, we delete the final
@@ -2035,7 +2045,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
if ((backslashCount % 2) == 1) {
result.truncate(result.length() - 1);
}
- ICU_Utility::escapeUnprintable(result, c);
+ ICU_Utility::escape(result, c);
backslashCount = 0;
} else {
result.append(c);
@@ -2074,52 +2084,51 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
{
result.append(u'[');
-// // Check against the predefined categories. We implicitly build
-// // up ALL category sets the first time toPattern() is called.
-// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
-// if (*this == getCategorySet(cat)) {
-// result.append(u':');
-// result.append(CATEGORY_NAMES, cat*2, 2);
-// return result.append(CATEGORY_CLOSE);
-// }
-// }
-
- int32_t count = getRangeCount();
+ int32_t i = 0;
+ int32_t limit = len & ~1; // = 2 * getRangeCount()
// If the set contains at least 2 intervals and includes both
// MIN_VALUE and MAX_VALUE, then the inverse representation will
// be more economical.
- if (count > 1 &&
- getRangeStart(0) == MIN_VALUE &&
- getRangeEnd(count-1) == MAX_VALUE) {
-
+ // if (getRangeCount() >= 2 &&
+ // getRangeStart(0) == MIN_VALUE &&
+ // getRangeEnd(last) == MAX_VALUE)
+ // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
+ // If limit == len then len is even and the last range ends with MAX_VALUE.
+ //
+ // *But* do not write the inverse (complement) if there are strings.
+ // Since ICU 70, the '^' performs a code point complement which removes all strings.
+ if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
// Emit the inverse
result.append(u'^');
-
- for (int32_t i = 1; i < count; ++i) {
- UChar32 start = getRangeEnd(i-1)+1;
- UChar32 end = getRangeStart(i)-1;
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(u'-');
- }
- _appendToPat(result, end, escapeUnprintable);
+ // Offsetting the inversion list index by one lets us
+ // iterate over the ranges of the set complement.
+ i = 1;
+ --limit;
+ }
+
+ // Emit the ranges as pairs.
+ while (i < limit) {
+ UChar32 start = list[i]; // getRangeStart()
+ UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
+ if (!(0xd800 <= end && end <= 0xdbff)) {
+ _appendToPat(result, start, end, escapeUnprintable);
+ i += 2;
+ } else {
+ // The range ends with a lead surrogate.
+ // Avoid writing what looks like a lead+trail surrogate pair.
+ // 1. Postpone ranges that start with a lead surrogate code point.
+ int32_t firstLead = i;
+ while ((i += 2) < limit && list[i] <= 0xdbff) {}
+ int32_t firstAfterLead = i;
+ // 2. Write following ranges that start with a trail surrogate code point.
+ while (i < limit && (start = list[i]) <= 0xdfff) {
+ _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
+ i += 2;
}
- }
- }
-
- // Default; emit the ranges as pairs
- else {
- for (int32_t i = 0; i < count; ++i) {
- UChar32 start = getRangeStart(i);
- UChar32 end = getRangeEnd(i);
- _appendToPat(result, start, escapeUnprintable);
- if (start != end) {
- if ((start+1) != end) {
- result.append(u'-');
- }
- _appendToPat(result, end, escapeUnprintable);
+ // 3. Now write the postponed ranges.
+ for (int j = firstLead; j < firstAfterLead; j += 2) {
+ _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
}
}
}