summary refs log tree commit diff
path: root/thirdparty/icu4c/common/normalizer2impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/icu4c/common/normalizer2impl.cpp')
-rw-r--r-- thirdparty/icu4c/common/normalizer2impl.cpp 152
1 files changed, 143 insertions, 9 deletions
diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp
index cbf6b4d980..c0ad5c69f3 100644
--- a/thirdparty/icu4c/common/normalizer2impl.cpp
+++ b/thirdparty/icu4c/common/normalizer2impl.cpp
@@ -731,9 +731,131 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
}
+// Decomposes the UTF-8 text in [src, limit). Dual functionality:
+// sink != nullptr: normalize; unchanged and changed spans are appended to *sink (and recorded via edits), and the position where processing stopped is returned (limit when the whole input was consumed)
+// sink == nullptr: isNormalized/spanQuickCheckYes; nothing is written, and prevBoundary is returned at the first character that is a "no" or has its cc out of order
+const uint8_t *
+Normalizer2Impl::decomposeUTF8(uint32_t options,
+ const uint8_t *src, const uint8_t *limit,
+ ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
+ U_ASSERT(limit != nullptr);
+ UnicodeString s16; // backing string handed to each ReorderingBuffer created in the slow path
+ uint8_t minNoLead = leadByteForCP(minDecompNoCP); // bytes below this value are skipped by the fast path without a trie lookup
+
+ const uint8_t *prevBoundary = src; // start of the text that has not yet been appended to the sink
+ // only for quick check: cc of the previous character, used to detect out-of-order combining classes
+ uint8_t prevCC = 0;
+
+ for (;;) {
+ // Fast path: Scan over a sequence of characters below the minimum "no" code point,
+ // or with (decompYes && ccc==0) properties.
+ const uint8_t *fastStart = src;
+ const uint8_t *prevSrc;
+ uint16_t norm16 = 0;
+
+ for (;;) {
+ if (src == limit) {
+ if (prevBoundary != limit && sink != nullptr) {
+ ByteSinkUtil::appendUnchanged(prevBoundary, limit,
+ *sink, options, edits, errorCode);
+ }
+ return src; // whole input consumed; src == limit here
+ }
+ if (*src < minNoLead) {
+ ++src;
+ } else {
+ prevSrc = src; // remember where the character about to be looked up starts
+ UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
+ if (!isMostDecompYesAndZeroCC(norm16)) {
+ break;
+ }
+ }
+ }
+ // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
+ // and the current character at [prevSrc..src[ is not a common case with cc=0
+ // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
+ // It could still be a maybeYes with cc=0.
+ if (prevSrc != fastStart) {
+ // The fast path looped over yes/0 characters before the current one.
+ if (sink != nullptr &&
+ !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
+ *sink, options, edits, errorCode)) {
+ break; // append failed; leave the main loop
+ }
+ prevBoundary = prevSrc;
+ prevCC = 0;
+ }
+
+ // Medium-fast path: Quick check.
+ if (isMaybeOrNonZeroCC(norm16)) {
+ // Does not decompose.
+ uint8_t cc = getCCFromYesOrMaybe(norm16);
+ if (prevCC <= cc || cc == 0) { // cc is not lower than the previous one, or it is 0
+ prevCC = cc;
+ if (cc <= 1) { // flush the pending text up to and including this character
+ if (sink != nullptr &&
+ !ByteSinkUtil::appendUnchanged(prevBoundary, src,
+ *sink, options, edits, errorCode)) {
+ break;
+ }
+ prevBoundary = src;
+ }
+ continue; // back to the fast path
+ }
+ }
+ if (sink == nullptr) {
+ return prevBoundary; // quick check: "no" or cc out of order
+ }
+
+ // Slow path
+ // Decompose up to and including the current character.
+ if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
+ if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
+ *sink, options, edits, errorCode)) {
+ break;
+ }
+ prevBoundary = prevSrc;
+ }
+ ReorderingBuffer buffer(*this, s16, errorCode);
+ if (U_FAILURE(errorCode)) {
+ break;
+ }
+ decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
+ buffer, errorCode); // first pass: feed [prevBoundary, src) into the buffer
+ // Decompose until the next boundary.
+ if (buffer.getLastCC() > 1) { // only continue past src when the last cc in the buffer is above 1
+ src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
+ buffer, errorCode);
+ }
+ if (U_FAILURE(errorCode)) {
+ break;
+ }
+ if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals(); NOTE(review): equals() below spans [prevBoundary, src), which can be longer than [prevSrc, src) -- confirm this guard is sufficient
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ break;
+ }
+ // We already know there was a change if the original character decomposed;
+ // otherwise compare.
+ if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
+ if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
+ *sink, options, edits, errorCode)) {
+ break;
+ }
+ } else {
+ if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
+ *sink, edits, errorCode)) {
+ break;
+ }
+ }
+ prevBoundary = src;
+ prevCC = 0;
+ }
+ return src; // reached only through one of the break statements above (append failure or error code set)
+}
+
const uint8_t *
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
- UBool stopAtCompBoundary, UBool onlyContiguous,
+ StopAt stopAt, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return nullptr;
@@ -746,21 +868,28 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
UChar32 c = U_SENTINEL;
if (norm16 >= limitNoNo) {
if (isMaybeOrNonZeroCC(norm16)) {
- // No boundaries around this character.
+ // No comp boundaries around this character.
+ uint8_t cc = getCCFromYesOrMaybe(norm16);
+ if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
+ return prevSrc;
+ }
c = codePointFromValidUTF8(prevSrc, src);
- if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
+ if (!buffer.append(c, cc, errorCode)) {
return nullptr;
}
+ if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
+ return src;
+ }
continue;
}
// Maps to an isCompYesAndZeroCC.
- if (stopAtCompBoundary) {
+ if (stopAt != STOP_AT_LIMIT) {
return prevSrc;
}
c = codePointFromValidUTF8(prevSrc, src);
c = mapAlgorithmic(c, norm16);
norm16 = getRawNorm16(c);
- } else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
+ } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
return prevSrc;
}
// norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
@@ -768,7 +897,8 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
// its norm16==INERT is normalization-inert,
// so it gets copied unchanged in the fast path,
// and we stop the slow path where invalid UTF-8 begins.
- U_ASSERT(norm16 != INERT);
+ // c >= 0 is the result of an algorithmic mapping.
+ U_ASSERT(c >= 0 || norm16 != INERT);
if (norm16 < minYesNo) {
if (c < 0) {
c = codePointFromValidUTF8(prevSrc, src);
@@ -798,11 +928,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
} else {
leadCC = 0;
}
+ if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
+ return prevSrc;
+ }
if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
return nullptr;
}
}
- if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
+ if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
+ (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
return src;
}
}
@@ -1954,10 +2088,10 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
break;
}
// We know there is not a boundary here.
- decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
+ decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
buffer, errorCode);
// Decompose until the next boundary.
- src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
+ src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
buffer, errorCode);
if (U_FAILURE(errorCode)) {
break;