summaryrefslogtreecommitdiff
path: root/thirdparty/icu4c/common/uniset.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/icu4c/common/uniset.cpp')
-rw-r--r--thirdparty/icu4c/common/uniset.cpp128
1 files changed, 59 insertions, 69 deletions
diff --git a/thirdparty/icu4c/common/uniset.cpp b/thirdparty/icu4c/common/uniset.cpp
index b73d612f24..461e5a7197 100644
--- a/thirdparty/icu4c/common/uniset.cpp
+++ b/thirdparty/icu4c/common/uniset.cpp
@@ -30,24 +30,6 @@
#include "bmpset.h"
#include "unisetspan.h"
-// Define UChar constants using hex for EBCDIC compatibility
-// Used #define to reduce private static exports and memory access time.
-#define SET_OPEN ((UChar)0x005B) /*[*/
-#define SET_CLOSE ((UChar)0x005D) /*]*/
-#define HYPHEN ((UChar)0x002D) /*-*/
-#define COMPLEMENT ((UChar)0x005E) /*^*/
-#define COLON ((UChar)0x003A) /*:*/
-#define BACKSLASH ((UChar)0x005C) /*\*/
-#define INTERSECTION ((UChar)0x0026) /*&*/
-#define UPPER_U ((UChar)0x0055) /*U*/
-#define LOWER_U ((UChar)0x0075) /*u*/
-#define OPEN_BRACE ((UChar)123) /*{*/
-#define CLOSE_BRACE ((UChar)125) /*}*/
-#define UPPER_P ((UChar)0x0050) /*P*/
-#define LOWER_P ((UChar)0x0070) /*p*/
-#define UPPER_N ((UChar)78) /*N*/
-#define EQUALS ((UChar)0x003D) /*=*/
-
// HIGH_VALUE > all valid values. 110000 for codepoints
#define UNICODESET_HIGH 0x0110000
@@ -444,7 +426,6 @@ UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
* @return <tt>true</tt> if this set contains the specified string
*/
UBool UnicodeSet::contains(const UnicodeString& s) const {
- if (s.length() == 0) return FALSE;
int32_t cp = getSingleCP(s);
if (cp < 0) {
return stringsContains(s);
@@ -559,11 +540,9 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
if (hasStrings()) {
for (i=0; i<strings->size(); ++i) {
const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
- //if (s.length() == 0) {
- // // Empty strings match everything
- // return TRUE;
- //}
- // assert(s.length() != 0); // We enforce this elsewhere
+ if (s.isEmpty()) {
+ continue; // skip the empty string
+ }
UChar32 c = s.char32At(0);
if ((c & 0xFF) == v) {
return TRUE;
@@ -582,9 +561,6 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
int32_t limit,
UBool incremental) {
if (offset == limit) {
- // Strings, if any, have length != 0, so we don't worry
- // about them here. If we ever allow zero-length strings
- // we much check for them here.
if (contains(U_ETHER)) {
return incremental ? U_PARTIAL_MATCH : U_MATCH;
} else {
@@ -614,11 +590,9 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
for (i=0; i<strings->size(); ++i) {
const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
-
- //if (trial.length() == 0) {
- // return U_MATCH; // null-string always matches
- //}
- // assert(trial.length() != 0); // We ensure this elsewhere
+ if (trial.isEmpty()) {
+ continue; // skip the empty string
+ }
UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
@@ -971,12 +945,12 @@ UnicodeSet& UnicodeSet::add(UChar32 c) {
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus "ch" => {"ch"}
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
* @param s the source string
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (!stringsContains(s)) {
@@ -991,8 +965,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
/**
* Adds the given string, in order, to 'strings'. The given string
- * must have been checked by the caller to not be empty and to not
- * already be in 'strings'.
+ * must have been checked by the caller to not already be in 'strings'.
*/
void UnicodeSet::_add(const UnicodeString& s) {
if (isFrozen() || isBogus()) {
@@ -1021,16 +994,13 @@ void UnicodeSet::_add(const UnicodeString& s) {
* @param string to test
*/
int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
- //if (s.length() < 1) {
- // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
- //}
- if (s.length() > 2) return -1;
- if (s.length() == 1) return s.charAt(0);
-
- // at this point, len = 2
- UChar32 cp = s.char32At(0);
- if (cp > 0xFFFF) { // is surrogate pair
- return cp;
+ int32_t sLength = s.length(); // lengths other than 1 or 2 (including 0) fall through to the final -1
+ if (sLength == 1) return s.charAt(0); // exactly one unit: return it as the code point
+ if (sLength == 2) { // two units count as one code point only when char32At(0) is above 0xFFFF
+ UChar32 cp = s.char32At(0);
+ if (cp > 0xFFFF) { // is surrogate pair
+ return cp;
+ }
}
return -1;
}
@@ -1150,6 +1120,26 @@ UnicodeSet& UnicodeSet::retain(UChar32 c) {
return retain(c, c);
}
+UnicodeSet& UnicodeSet::retain(const UnicodeString &s) { // keeps s in this set only if already present; everything else is removed
+ if (isFrozen() || isBogus()) { return *this; } // frozen or bogus sets are returned unmodified
+ UChar32 cp = getSingleCP(s); // negative when s is not exactly one code point
+ if (cp < 0) { // s must be handled as a string element
+ bool isIn = stringsContains(s);
+ // Check for getRangeCount() first to avoid somewhat-expensive size()
+ // when there are single code points.
+ if (isIn && getRangeCount() == 0 && size() == 1) {
+ return *this; // no code point ranges and s is the sole element: nothing to change
+ }
+ clear(); // drop all current contents; s is put back below only if it was a member
+ if (isIn) {
+ _add(s);
+ }
+ } else {
+ retain(cp, cp); // single code point: delegate to the range overload
+ }
+ return *this; // returned for call chaining
+}
+
/**
* Removes the specified range from this set if it is present.
* The set will not contain the specified range once the call
@@ -1186,7 +1176,7 @@ UnicodeSet& UnicodeSet::remove(UChar32 c) {
* @return the modified set, for chaining
*/
UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (strings != nullptr && strings->removeElement((void*) &s)) {
@@ -1252,12 +1242,12 @@ UnicodeSet& UnicodeSet::complement(void) {
* Complement the specified string in this set.
* The set will not contain the specified string once the call
* returns.
- * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
+ *
* @param s the string to complement
* @return this object, for chaining
*/
UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
- if (s.length() == 0 || isFrozen() || isBogus()) return *this;
+ if (isFrozen() || isBogus()) return *this;
int32_t cp = getSingleCP(s);
if (cp < 0) {
if (stringsContains(s)) {
@@ -2001,22 +1991,22 @@ escapeUnprintable) {
}
// Okay to let ':' pass through
switch (c) {
- case SET_OPEN:
- case SET_CLOSE:
- case HYPHEN:
- case COMPLEMENT:
- case INTERSECTION:
- case BACKSLASH:
- case OPEN_BRACE:
- case CLOSE_BRACE:
- case COLON:
+ case u'[':
+ case u']':
+ case u'-':
+ case u'^':
+ case u'&':
+ case u'\\':
+ case u'{':
+ case u'}':
+ case u':':
case SymbolTable::SYMBOL_REF:
- buf.append(BACKSLASH);
+ buf.append(u'\\');
break;
default:
// Escape whitespace
if (PatternProps::isWhiteSpace(c)) {
- buf.append(BACKSLASH);
+ buf.append(u'\\');
}
break;
}
@@ -2049,7 +2039,7 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
backslashCount = 0;
} else {
result.append(c);
- if (c == BACKSLASH) {
+ if (c == u'\\') {
++backslashCount;
} else {
backslashCount = 0;
@@ -2082,13 +2072,13 @@ UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
UBool escapeUnprintable) const
{
- result.append(SET_OPEN);
+ result.append(u'[');
// // Check against the predefined categories. We implicitly build
// // up ALL category sets the first time toPattern() is called.
// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
// if (*this == getCategorySet(cat)) {
-// result.append(COLON);
+// result.append(u':');
// result.append(CATEGORY_NAMES, cat*2, 2);
// return result.append(CATEGORY_CLOSE);
// }
@@ -2104,7 +2094,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
getRangeEnd(count-1) == MAX_VALUE) {
// Emit the inverse
- result.append(COMPLEMENT);
+ result.append(u'^');
for (int32_t i = 1; i < count; ++i) {
UChar32 start = getRangeEnd(i-1)+1;
@@ -2112,7 +2102,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
- result.append(HYPHEN);
+ result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
}
@@ -2127,7 +2117,7 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
_appendToPat(result, start, escapeUnprintable);
if (start != end) {
if ((start+1) != end) {
- result.append(HYPHEN);
+ result.append(u'-');
}
_appendToPat(result, end, escapeUnprintable);
}
@@ -2136,14 +2126,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
if (strings != nullptr) {
for (int32_t i = 0; i<strings->size(); ++i) {
- result.append(OPEN_BRACE);
+ result.append(u'{');
_appendToPat(result,
*(const UnicodeString*) strings->elementAt(i),
escapeUnprintable);
- result.append(CLOSE_BRACE);
+ result.append(u'}');
}
}
- return result.append(SET_CLOSE);
+ return result.append(u']');
}
/**