// © 2017 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // edits.cpp // created: 2017feb08 Markus W. Scherer #include "unicode/edits.h" #include "unicode/unistr.h" #include "unicode/utypes.h" #include "cmemory.h" #include "uassert.h" #include "util.h" U_NAMESPACE_BEGIN namespace { // 0000uuuuuuuuuuuu records u+1 unchanged text units. const int32_t MAX_UNCHANGED_LENGTH = 0x1000; const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6; const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7; const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff; const int32_t MAX_SHORT_CHANGE = 0x6fff; // 0111mmmmmmnnnnnn records a replacement of m text units with n. // m or n = 61: actual length follows in the next edits array unit. // m or n = 62..63: actual length follows in the next two edits array units. // Bit 30 of the actual length is in the head unit. // Trailing units have bit 15 set. const int32_t LENGTH_IN_1TRAIL = 61; const int32_t LENGTH_IN_2TRAIL = 62; } // namespace void Edits::releaseArray() U_NOEXCEPT { if (array != stackArray) { uprv_free(array); } } Edits &Edits::copyArray(const Edits &other) { if (U_FAILURE(errorCode_)) { length = delta = numChanges = 0; return *this; } if (length > capacity) { uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)length * 2); if (newArray == nullptr) { length = delta = numChanges = 0; errorCode_ = U_MEMORY_ALLOCATION_ERROR; return *this; } releaseArray(); array = newArray; capacity = length; } if (length > 0) { uprv_memcpy(array, other.array, (size_t)length * 2); } return *this; } Edits &Edits::moveArray(Edits &src) U_NOEXCEPT { if (U_FAILURE(errorCode_)) { length = delta = numChanges = 0; return *this; } releaseArray(); if (length > STACK_CAPACITY) { array = src.array; capacity = src.capacity; src.array = src.stackArray; src.capacity = STACK_CAPACITY; src.reset(); return *this; } array = stackArray; capacity = STACK_CAPACITY; if (length > 0) { uprv_memcpy(array, src.array, (size_t)length * 2); } return *this; } Edits &Edits::operator=(const Edits &other) { if (this == &other) { return *this; } // self-assignment: no-op length = other.length; delta = other.delta; numChanges = other.numChanges; errorCode_ = other.errorCode_; return copyArray(other); } Edits &Edits::operator=(Edits &&src) U_NOEXCEPT { length = src.length; delta = src.delta; numChanges = src.numChanges; errorCode_ = src.errorCode_; return moveArray(src); } Edits::~Edits() { releaseArray(); } void Edits::reset() U_NOEXCEPT { length = delta = numChanges = 0; errorCode_ = U_ZERO_ERROR; } void Edits::addUnchanged(int32_t unchangedLength) { if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; } if(unchangedLength < 0) { errorCode_ = U_ILLEGAL_ARGUMENT_ERROR; return; } // Merge into previous unchanged-text record, if any. int32_t last = lastUnit(); if(last < MAX_UNCHANGED) { int32_t remaining = MAX_UNCHANGED - last; if (remaining >= unchangedLength) { setLastUnit(last + unchangedLength); return; } setLastUnit(MAX_UNCHANGED); unchangedLength -= remaining; } // Split large lengths into multiple units. while(unchangedLength >= MAX_UNCHANGED_LENGTH) { append(MAX_UNCHANGED); unchangedLength -= MAX_UNCHANGED_LENGTH; } // Write a small (remaining) length. if(unchangedLength > 0) { append(unchangedLength - 1); } } void Edits::addReplace(int32_t oldLength, int32_t newLength) { if(U_FAILURE(errorCode_)) { return; } if(oldLength < 0 || newLength < 0) { errorCode_ = U_ILLEGAL_ARGUMENT_ERROR; return; } if (oldLength == 0 && newLength == 0) { return; } ++numChanges; int32_t newDelta = newLength - oldLength; if (newDelta != 0) { if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) || (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) { // Integer overflow or underflow. errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; return; } delta += newDelta; } if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { // Merge into previous same-lengths short-replacement record, if any. int32_t u = (oldLength << 12) | (newLength << 9); int32_t last = lastUnit(); if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && (last & ~SHORT_CHANGE_NUM_MASK) == u && (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { setLastUnit(last + 1); return; } append(u); return; } int32_t head = 0x7000; if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { head |= oldLength << 6; head |= newLength; append(head); } else if ((capacity - length) >= 5 || growArray()) { int32_t limit = length + 1; if(oldLength < LENGTH_IN_1TRAIL) { head |= oldLength << 6; } else if(oldLength <= 0x7fff) { head |= LENGTH_IN_1TRAIL << 6; array[limit++] = (uint16_t)(0x8000 | oldLength); } else { head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15)); array[limit++] = (uint16_t)(0x8000 | oldLength); } if(newLength < LENGTH_IN_1TRAIL) { head |= newLength; } else if(newLength <= 0x7fff) { head |= LENGTH_IN_1TRAIL; array[limit++] = (uint16_t)(0x8000 | newLength); } else { head |= LENGTH_IN_2TRAIL + (newLength >> 30); array[limit++] = (uint16_t)(0x8000 | (newLength >> 15)); array[limit++] = (uint16_t)(0x8000 | newLength); } array[length] = (uint16_t)head; length = limit; } } void Edits::append(int32_t r) { if(length < capacity || growArray()) { array[length++] = (uint16_t)r; } } UBool Edits::growArray() { int32_t newCapacity; if (array == stackArray) { newCapacity = 2000; } else if (capacity == INT32_MAX) { // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API // with a result-string-buffer overflow. errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; return false; } else if (capacity >= (INT32_MAX / 2)) { newCapacity = INT32_MAX; } else { newCapacity = 2 * capacity; } // Grow by at least 5 units so that a maximal change record will fit. if ((newCapacity - capacity) < 5) { errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR; return false; } uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2); if (newArray == NULL) { errorCode_ = U_MEMORY_ALLOCATION_ERROR; return false; } uprv_memcpy(newArray, array, (size_t)length * 2); releaseArray(); array = newArray; capacity = newCapacity; return true; } UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const { if (U_FAILURE(outErrorCode)) { return true; } if (U_SUCCESS(errorCode_)) { return false; } outErrorCode = errorCode_; return true; } Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) { if (copyErrorTo(errorCode)) { return *this; } // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. // Parallel iteration over both Edits. Iterator abIter = ab.getFineIterator(); Iterator bcIter = bc.getFineIterator(); UBool abHasNext = true, bcHasNext = true; // Copy iterator state into local variables, so that we can modify and subdivide spans. // ab old & new length, bc old & new length int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; // When we have different-intermediate-length changes, we accumulate a larger change. int32_t pending_aLength = 0, pending_cLength = 0; for (;;) { // At this point, for each of the two iterators: // Either we are done with the locally cached current edit, // and its intermediate-string length has been reset, // or we will continue to work with a truncated remainder of this edit. // // If the current edit is done, and the iterator has not yet reached the end, // then we fetch the next edit. This is true for at least one of the iterators. // // Normally it does not matter whether we fetch from ab and then bc or vice versa. // However, the result is observably different when // ab deletions meet bc insertions at the same intermediate-string index. // Some users expect the bc insertions to come first, so we fetch from bc first. if (bc_bLength == 0) { if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) { bc_bLength = bcIter.oldLength(); cLength = bcIter.newLength(); if (bc_bLength == 0) { // insertion if (ab_bLength == 0 || !abIter.hasChange()) { addReplace(pending_aLength, pending_cLength + cLength); pending_aLength = pending_cLength = 0; } else { pending_cLength += cLength; } continue; } } // else see if the other iterator is done, too. } if (ab_bLength == 0) { if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) { aLength = abIter.oldLength(); ab_bLength = abIter.newLength(); if (ab_bLength == 0) { // deletion if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { addReplace(pending_aLength + aLength, pending_cLength); pending_aLength = pending_cLength = 0; } else { pending_aLength += aLength; } continue; } } else if (bc_bLength == 0) { // Both iterators are done at the same time: // The intermediate-string lengths match. break; } else { // The ab output string is shorter than the bc input string. if (!copyErrorTo(errorCode)) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; } return *this; } } if (bc_bLength == 0) { // The bc input string is shorter than the ab output string. if (!copyErrorTo(errorCode)) { errorCode = U_ILLEGAL_ARGUMENT_ERROR; } return *this; } // Done fetching: ab_bLength > 0 && bc_bLength > 0 // The current state has two parts: // - Past: We accumulate a longer ac edit in the "pending" variables. // - Current: We have copies of the current ab/bc edits in local variables. // At least one side is newly fetched. // One side might be a truncated remainder of an edit we fetched earlier. if (!abIter.hasChange() && !bcIter.hasChange()) { // An unchanged span all the way from string a to string c. if (pending_aLength != 0 || pending_cLength != 0) { addReplace(pending_aLength, pending_cLength); pending_aLength = pending_cLength = 0; } int32_t unchangedLength = aLength <= cLength ? aLength : cLength; addUnchanged(unchangedLength); ab_bLength = aLength -= unchangedLength; bc_bLength = cLength -= unchangedLength; // At least one of the unchanged spans is now empty. continue; } if (!abIter.hasChange() && bcIter.hasChange()) { // Unchanged a->b but changed b->c. if (ab_bLength >= bc_bLength) { // Split the longer unchanged span into change + remainder. addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); pending_aLength = pending_cLength = 0; aLength = ab_bLength -= bc_bLength; bc_bLength = 0; continue; } // Handle the shorter unchanged span below like a change. } else if (abIter.hasChange() && !bcIter.hasChange()) { // Changed a->b and then unchanged b->c. if (ab_bLength <= bc_bLength) { // Split the longer unchanged span into change + remainder. addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); pending_aLength = pending_cLength = 0; cLength = bc_bLength -= ab_bLength; ab_bLength = 0; continue; } // Handle the shorter unchanged span below like a change. } else { // both abIter.hasChange() && bcIter.hasChange() if (ab_bLength == bc_bLength) { // Changes on both sides up to the same position. Emit & reset. addReplace(pending_aLength + aLength, pending_cLength + cLength); pending_aLength = pending_cLength = 0; ab_bLength = bc_bLength = 0; continue; } } // Accumulate the a->c change, reset the shorter side, // keep a remainder of the longer one. pending_aLength += aLength; pending_cLength += cLength; if (ab_bLength < bc_bLength) { bc_bLength -= ab_bLength; cLength = ab_bLength = 0; } else { // ab_bLength > bc_bLength ab_bLength -= bc_bLength; aLength = bc_bLength = 0; } } if (pending_aLength != 0 || pending_cLength != 0) { addReplace(pending_aLength, pending_cLength); } copyErrorTo(errorCode); return *this; } Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) : array(a), index(0), length(len), remaining(0), onlyChanges_(oc), coarse(crs), dir(0), changed(false), oldLength_(0), newLength_(0), srcIndex(0), replIndex(0), destIndex(0) {} int32_t Edits::Iterator::readLength(int32_t head) { if (head < LENGTH_IN_1TRAIL) { return head; } else if (head < LENGTH_IN_2TRAIL) { U_ASSERT(index < length); U_ASSERT(array[index] >= 0x8000); return array[index++] & 0x7fff; } else { U_ASSERT((index + 2) <= length); U_ASSERT(array[index] >= 0x8000); U_ASSERT(array[index + 1] >= 0x8000); int32_t len = ((head & 1) << 30) | ((int32_t)(array[index] & 0x7fff) << 15) | (array[index + 1] & 0x7fff); index += 2; return len; } } void Edits::Iterator::updateNextIndexes() { srcIndex += oldLength_; if (changed) { replIndex += newLength_; } destIndex += newLength_; } void Edits::Iterator::updatePreviousIndexes() { srcIndex -= oldLength_; if (changed) { replIndex -= newLength_; } destIndex -= newLength_; } UBool Edits::Iterator::noNext() { // No change before or beyond the string. dir = 0; changed = false; oldLength_ = newLength_ = 0; return false; } UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) { // Forward iteration: Update the string indexes to the limit of the current span, // and post-increment-read array units to assemble a new span. // Leaves the array index one after the last unit of that span. if (U_FAILURE(errorCode)) { return false; } // We have an errorCode in case we need to start guarding against integer overflows. // It is also convenient for caller loops if we bail out when an error was set elsewhere. if (dir > 0) { updateNextIndexes(); } else { if (dir < 0) { // Turn around from previous() to next(). // Post-increment-read the same span again. if (remaining > 0) { // Fine-grained iterator: // Stay on the current one of a sequence of compressed changes. ++index; // next() rests on the index after the sequence unit. dir = 1; return true; } } dir = 1; } if (remaining >= 1) { // Fine-grained iterator: Continue a sequence of compressed changes. if (remaining > 1) { --remaining; return true; } remaining = 0; } if (index >= length) { return noNext(); } int32_t u = array[index++]; if (u <= MAX_UNCHANGED) { // Combine adjacent unchanged ranges. changed = false; oldLength_ = u + 1; while (index < length && (u = array[index]) <= MAX_UNCHANGED) { ++index; oldLength_ += u + 1; } newLength_ = oldLength_; if (onlyChanges) { updateNextIndexes(); if (index >= length) { return noNext(); } // already fetched u > MAX_UNCHANGED at index ++index; } else { return true; } } changed = true; if (u <= MAX_SHORT_CHANGE) { int32_t oldLen = u >> 12; int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; if (coarse) { oldLength_ = num * oldLen; newLength_ = num * newLen; } else { // Split a sequence of changes that was compressed into one unit. oldLength_ = oldLen; newLength_ = newLen; if (num > 1) { remaining = num; // This is the first of two or more changes. } return true; } } else { U_ASSERT(u <= 0x7fff); oldLength_ = readLength((u >> 6) & 0x3f); newLength_ = readLength(u & 0x3f); if (!coarse) { return true; } } // Combine adjacent changes. while (index < length && (u = array[index]) > MAX_UNCHANGED) { ++index; if (u <= MAX_SHORT_CHANGE) { int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; oldLength_ += (u >> 12) * num; newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; } else { U_ASSERT(u <= 0x7fff); oldLength_ += readLength((u >> 6) & 0x3f); newLength_ += readLength(u & 0x3f); } } return true; } UBool Edits::Iterator::previous(UErrorCode &errorCode) { // Backward iteration: Pre-decrement-read array units to assemble a new span, // then update the string indexes to the start of that span. // Leaves the array index on the head unit of that span. if (U_FAILURE(errorCode)) { return false; } // We have an errorCode in case we need to start guarding against integer overflows. // It is also convenient for caller loops if we bail out when an error was set elsewhere. if (dir >= 0) { if (dir > 0) { // Turn around from next() to previous(). // Set the string indexes to the span limit and // pre-decrement-read the same span again. if (remaining > 0) { // Fine-grained iterator: // Stay on the current one of a sequence of compressed changes. --index; // previous() rests on the sequence unit. dir = -1; return true; } updateNextIndexes(); } dir = -1; } if (remaining > 0) { // Fine-grained iterator: Continue a sequence of compressed changes. int32_t u = array[index]; U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { ++remaining; updatePreviousIndexes(); return true; } remaining = 0; } if (index <= 0) { return noNext(); } int32_t u = array[--index]; if (u <= MAX_UNCHANGED) { // Combine adjacent unchanged ranges. changed = false; oldLength_ = u + 1; while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { --index; oldLength_ += u + 1; } newLength_ = oldLength_; // No need to handle onlyChanges as long as previous() is called only from findIndex(). updatePreviousIndexes(); return true; } changed = true; if (u <= MAX_SHORT_CHANGE) { int32_t oldLen = u >> 12; int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; if (coarse) { oldLength_ = num * oldLen; newLength_ = num * newLen; } else { // Split a sequence of changes that was compressed into one unit. oldLength_ = oldLen; newLength_ = newLen; if (num > 1) { remaining = 1; // This is the last of two or more changes. } updatePreviousIndexes(); return true; } } else { if (u <= 0x7fff) { // The change is encoded in u alone. oldLength_ = readLength((u >> 6) & 0x3f); newLength_ = readLength(u & 0x3f); } else { // Back up to the head of the change, read the lengths, // and reset the index to the head again. U_ASSERT(index > 0); while ((u = array[--index]) > 0x7fff) {} U_ASSERT(u > MAX_SHORT_CHANGE); int32_t headIndex = index++; oldLength_ = readLength((u >> 6) & 0x3f); newLength_ = readLength(u & 0x3f); index = headIndex; } if (!coarse) { updatePreviousIndexes(); return true; } } // Combine adjacent changes. while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { --index; if (u <= MAX_SHORT_CHANGE) { int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1; oldLength_ += (u >> 12) * num; newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; } else if (u <= 0x7fff) { // Read the lengths, and reset the index to the head again. int32_t headIndex = index++; oldLength_ += readLength((u >> 6) & 0x3f); newLength_ += readLength(u & 0x3f); index = headIndex; } } updatePreviousIndexes(); return true; } int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) { if (U_FAILURE(errorCode) || i < 0) { return -1; } int32_t spanStart, spanLength; if (findSource) { // find source index spanStart = srcIndex; spanLength = oldLength_; } else { // find destination index spanStart = destIndex; spanLength = newLength_; } if (i < spanStart) { if (i >= (spanStart / 2)) { // Search backwards. for (;;) { UBool hasPrevious = previous(errorCode); U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0 (void)hasPrevious; // avoid unused-variable warning spanStart = findSource ? srcIndex : destIndex; if (i >= spanStart) { // The index is in the current span. return 0; } if (remaining > 0) { // Is the index in one of the remaining compressed edits? // spanStart is the start of the current span, first of the remaining ones. spanLength = findSource ? oldLength_ : newLength_; int32_t u = array[index]; U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; int32_t len = num * spanLength; if (i >= (spanStart - len)) { int32_t n = ((spanStart - i - 1) / spanLength) + 1; // 1 <= n <= num srcIndex -= n * oldLength_; replIndex -= n * newLength_; destIndex -= n * newLength_; remaining += n; return 0; } // Skip all of these edits at once. srcIndex -= num * oldLength_; replIndex -= num * newLength_; destIndex -= num * newLength_; remaining = 0; } } } // Reset the iterator to the start. dir = 0; index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; } else if (i < (spanStart + spanLength)) { // The index is in the current span. return 0; } while (next(false, errorCode)) { if (findSource) { spanStart = srcIndex; spanLength = oldLength_; } else { spanStart = destIndex; spanLength = newLength_; } if (i < (spanStart + spanLength)) { // The index is in the current span. return 0; } if (remaining > 1) { // Is the index in one of the remaining compressed edits? // spanStart is the start of the current span, first of the remaining ones. int32_t len = remaining * spanLength; if (i < (spanStart + len)) { int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 srcIndex += n * oldLength_; replIndex += n * newLength_; destIndex += n * newLength_; remaining -= n; return 0; } // Make next() skip all of these edits at once. oldLength_ *= remaining; newLength_ *= remaining; remaining = 0; } } return 1; } int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) { int32_t where = findIndex(i, true, errorCode); if (where < 0) { // Error or before the string. return 0; } if (where > 0 || i == srcIndex) { // At or after string length, or at start of the found span. return destIndex; } if (changed) { // In a change span, map to its end. return destIndex + newLength_; } else { // In an unchanged span, offset 1:1 within it. return destIndex + (i - srcIndex); } } int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) { int32_t where = findIndex(i, false, errorCode); if (where < 0) { // Error or before the string. return 0; } if (where > 0 || i == destIndex) { // At or after string length, or at start of the found span. return srcIndex; } if (changed) { // In a change span, map to its end. return srcIndex + oldLength_; } else { // In an unchanged span, offset within it. return srcIndex + (i - destIndex); } } UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const { sb.append(u"{ src[", -1); ICU_Utility::appendNumber(sb, srcIndex); sb.append(u"..", -1); ICU_Utility::appendNumber(sb, srcIndex + oldLength_); if (changed) { sb.append(u"] ⇝ dest[", -1); } else { sb.append(u"] ≡ dest[", -1); } ICU_Utility::appendNumber(sb, destIndex); sb.append(u"..", -1); ICU_Utility::appendNumber(sb, destIndex + newLength_); if (changed) { sb.append(u"], repl[", -1); ICU_Utility::appendNumber(sb, replIndex); sb.append(u"..", -1); ICU_Utility::appendNumber(sb, replIndex + newLength_); sb.append(u"] }", -1); } else { sb.append(u"] (no-change) }", -1); } return sb; } U_NAMESPACE_END