// © 2019 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // locdistance.cpp // created: 2019may08 Markus W. Scherer #include "unicode/utypes.h" #include "unicode/bytestrie.h" #include "unicode/localematcher.h" #include "unicode/locid.h" #include "unicode/uobject.h" #include "unicode/ures.h" #include "cstring.h" #include "locdistance.h" #include "loclikelysubtags.h" #include "uassert.h" #include "ucln_cmn.h" #include "uinvchar.h" #include "umutex.h" U_NAMESPACE_BEGIN namespace { /** * Bit flag used on the last character of a subtag in the trie. * Must be set consistently by the builder and the lookup code. */ constexpr int32_t END_OF_SUBTAG = 0x80; /** Distance value bit flag, set by the builder. */ constexpr int32_t DISTANCE_SKIP_SCRIPT = 0x80; /** Distance value bit flag, set by trieNext(). */ constexpr int32_t DISTANCE_IS_FINAL = 0x100; constexpr int32_t DISTANCE_IS_FINAL_OR_SKIP_SCRIPT = DISTANCE_IS_FINAL | DISTANCE_SKIP_SCRIPT; constexpr int32_t ABOVE_THRESHOLD = 100; // Indexes into array of distances. enum { IX_DEF_LANG_DISTANCE, IX_DEF_SCRIPT_DISTANCE, IX_DEF_REGION_DISTANCE, IX_MIN_REGION_DISTANCE, IX_LIMIT }; LocaleDistance *gLocaleDistance = nullptr; UInitOnce gInitOnce = U_INITONCE_INITIALIZER; UBool U_CALLCONV cleanup() { delete gLocaleDistance; gLocaleDistance = nullptr; gInitOnce.reset(); return TRUE; } } // namespace void U_CALLCONV LocaleDistance::initLocaleDistance(UErrorCode &errorCode) { // This function is invoked only via umtx_initOnce(). U_ASSERT(gLocaleDistance == nullptr); const XLikelySubtags &likely = *XLikelySubtags::getSingleton(errorCode); if (U_FAILURE(errorCode)) { return; } const LocaleDistanceData &data = likely.getDistanceData(); if (data.distanceTrieBytes == nullptr || data.regionToPartitions == nullptr || data.partitions == nullptr || // ok if no paradigms data.distances == nullptr) { errorCode = U_MISSING_RESOURCE_ERROR; return; } gLocaleDistance = new LocaleDistance(data, likely); if (gLocaleDistance == nullptr) { errorCode = U_MEMORY_ALLOCATION_ERROR; return; } ucln_common_registerCleanup(UCLN_COMMON_LOCALE_DISTANCE, cleanup); } const LocaleDistance *LocaleDistance::getSingleton(UErrorCode &errorCode) { if (U_FAILURE(errorCode)) { return nullptr; } umtx_initOnce(gInitOnce, &LocaleDistance::initLocaleDistance, errorCode); return gLocaleDistance; } LocaleDistance::LocaleDistance(const LocaleDistanceData &data, const XLikelySubtags &likely) : likelySubtags(likely), trie(data.distanceTrieBytes), regionToPartitionsIndex(data.regionToPartitions), partitionArrays(data.partitions), paradigmLSRs(data.paradigms), paradigmLSRsLength(data.paradigmsLength), defaultLanguageDistance(data.distances[IX_DEF_LANG_DISTANCE]), defaultScriptDistance(data.distances[IX_DEF_SCRIPT_DISTANCE]), defaultRegionDistance(data.distances[IX_DEF_REGION_DISTANCE]), minRegionDistance(data.distances[IX_MIN_REGION_DISTANCE]) { // For the default demotion value, use the // default region distance between unrelated Englishes. // Thus, unless demotion is turned off, // a mere region difference for one desired locale // is as good as a perfect match for the next following desired locale. // As of CLDR 36, we have <languageMatch desired="en_*_*" supported="en_*_*" distance="5"/>. LSR en("en", "Latn", "US", LSR::EXPLICIT_LSR); LSR enGB("en", "Latn", "GB", LSR::EXPLICIT_LSR); const LSR *p_enGB = &enGB; int32_t indexAndDistance = getBestIndexAndDistance(en, &p_enGB, 1, shiftDistance(50), ULOCMATCH_FAVOR_LANGUAGE, ULOCMATCH_DIRECTION_WITH_ONE_WAY); defaultDemotionPerDesiredLocale = getDistanceFloor(indexAndDistance); } int32_t LocaleDistance::getBestIndexAndDistance( const LSR &desired, const LSR **supportedLSRs, int32_t supportedLSRsLength, int32_t shiftedThreshold, ULocMatchFavorSubtag favorSubtag, ULocMatchDirection direction) const { BytesTrie iter(trie); // Look up the desired language only once for all supported LSRs. // Its "distance" is either a match point value of 0, or a non-match negative value. // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. int32_t desLangDistance = trieNext(iter, desired.language, false); uint64_t desLangState = desLangDistance >= 0 && supportedLSRsLength > 1 ? iter.getState64() : 0; // Index of the supported LSR with the lowest distance. int32_t bestIndex = -1; // Cached lookup info from XLikelySubtags.compareLikely(). int32_t bestLikelyInfo = -1; for (int32_t slIndex = 0; slIndex < supportedLSRsLength; ++slIndex) { const LSR &supported = *supportedLSRs[slIndex]; bool star = false; int32_t distance = desLangDistance; if (distance >= 0) { U_ASSERT((distance & DISTANCE_IS_FINAL) == 0); if (slIndex != 0) { iter.resetToState64(desLangState); } distance = trieNext(iter, supported.language, true); } // Note: The data builder verifies that there are no rules with "any" (*) language and // real (non *) script or region subtags. // This means that if the lookup for either language fails we can use // the default distances without further lookups. int32_t flags; if (distance >= 0) { flags = distance & DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; distance &= ~DISTANCE_IS_FINAL_OR_SKIP_SCRIPT; } else { // <*, *> if (uprv_strcmp(desired.language, supported.language) == 0) { distance = 0; } else { distance = defaultLanguageDistance; } flags = 0; star = true; } U_ASSERT(0 <= distance && distance <= 100); // Round up the shifted threshold (if fraction bits are not 0) // for comparison with un-shifted distances until we need fraction bits. // (If we simply shifted non-zero fraction bits away, then we might ignore a language // when it's really still a micro distance below the threshold.) int32_t roundedThreshold = (shiftedThreshold + DISTANCE_FRACTION_MASK) >> DISTANCE_SHIFT; // We implement "favor subtag" by reducing the language subtag distance // (unscientifically reducing it to a quarter of the normal value), // so that the script distance is relatively more important. // For example, given a default language distance of 80, we reduce it to 20, // which is below the default threshold of 50, which is the default script distance. if (favorSubtag == ULOCMATCH_FAVOR_SCRIPT) { distance >>= 2; } // Let distance == roundedThreshold pass until the tie-breaker logic // at the end of the loop. if (distance > roundedThreshold) { continue; } int32_t scriptDistance; if (star || flags != 0) { if (uprv_strcmp(desired.script, supported.script) == 0) { scriptDistance = 0; } else { scriptDistance = defaultScriptDistance; } } else { scriptDistance = getDesSuppScriptDistance(iter, iter.getState64(), desired.script, supported.script); flags = scriptDistance & DISTANCE_IS_FINAL; scriptDistance &= ~DISTANCE_IS_FINAL; } distance += scriptDistance; if (distance > roundedThreshold) { continue; } if (uprv_strcmp(desired.region, supported.region) == 0) { // regionDistance = 0 } else if (star || (flags & DISTANCE_IS_FINAL) != 0) { distance += defaultRegionDistance; } else { int32_t remainingThreshold = roundedThreshold - distance; if (minRegionDistance > remainingThreshold) { continue; } // From here on we know the regions are not equal. // Map each region to zero or more partitions. (zero = one non-matching string) // (Each array of single-character partition strings is encoded as one string.) // If either side has more than one, then we find the maximum distance. // This could be optimized by adding some more structure, but probably not worth it. distance += getRegionPartitionsDistance( iter, iter.getState64(), partitionsForRegion(desired), partitionsForRegion(supported), remainingThreshold); } int32_t shiftedDistance = shiftDistance(distance); if (shiftedDistance == 0) { // Distinguish between equivalent but originally unequal locales via an // additional micro distance. shiftedDistance |= (desired.flags ^ supported.flags); if (shiftedDistance < shiftedThreshold) { if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || // Is there also a match when we swap desired/supported? isMatch(supported, desired, shiftedThreshold, favorSubtag)) { if (shiftedDistance == 0) { return slIndex << INDEX_SHIFT; } bestIndex = slIndex; shiftedThreshold = shiftedDistance; bestLikelyInfo = -1; } } } else { if (shiftedDistance < shiftedThreshold) { if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || // Is there also a match when we swap desired/supported? isMatch(supported, desired, shiftedThreshold, favorSubtag)) { bestIndex = slIndex; shiftedThreshold = shiftedDistance; bestLikelyInfo = -1; } } else if (shiftedDistance == shiftedThreshold && bestIndex >= 0) { if (direction != ULOCMATCH_DIRECTION_ONLY_TWO_WAY || // Is there also a match when we swap desired/supported? isMatch(supported, desired, shiftedThreshold, favorSubtag)) { bestLikelyInfo = likelySubtags.compareLikely( supported, *supportedLSRs[bestIndex], bestLikelyInfo); if ((bestLikelyInfo & 1) != 0) { // This supported locale matches as well as the previous best match, // and neither matches perfectly, // but this one is "more likely" (has more-default subtags). bestIndex = slIndex; } } } } } return bestIndex >= 0 ? (bestIndex << INDEX_SHIFT) | shiftedThreshold : INDEX_NEG_1 | shiftDistance(ABOVE_THRESHOLD); } int32_t LocaleDistance::getDesSuppScriptDistance( BytesTrie &iter, uint64_t startState, const char *desired, const char *supported) { // Note: The data builder verifies that there are no <*, supported> or <desired, *> rules. int32_t distance = trieNext(iter, desired, false); if (distance >= 0) { distance = trieNext(iter, supported, true); } if (distance < 0) { UStringTrieResult result = iter.resetToState64(startState).next(u'*'); // <*, *> U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); if (uprv_strcmp(desired, supported) == 0) { distance = 0; // same script } else { distance = iter.getValue(); U_ASSERT(distance >= 0); } if (result == USTRINGTRIE_FINAL_VALUE) { distance |= DISTANCE_IS_FINAL; } } return distance; } int32_t LocaleDistance::getRegionPartitionsDistance( BytesTrie &iter, uint64_t startState, const char *desiredPartitions, const char *supportedPartitions, int32_t threshold) { char desired = *desiredPartitions++; char supported = *supportedPartitions++; U_ASSERT(desired != 0 && supported != 0); // See if we have single desired/supported partitions, from NUL-terminated // partition strings without explicit length. bool suppLengthGt1 = *supportedPartitions != 0; // gt1: more than 1 character // equivalent to: if (desLength == 1 && suppLength == 1) if (*desiredPartitions == 0 && !suppLengthGt1) { // Fastpath for single desired/supported partitions. UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); if (USTRINGTRIE_HAS_NEXT(result)) { result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); if (USTRINGTRIE_HAS_VALUE(result)) { return iter.getValue(); } } return getFallbackRegionDistance(iter, startState); } const char *supportedStart = supportedPartitions - 1; // for restart of inner loop int32_t regionDistance = 0; // Fall back to * only once, not for each pair of partition strings. bool star = false; for (;;) { // Look up each desired-partition string only once, // not for each (desired, supported) pair. UStringTrieResult result = iter.next(uprv_invCharToAscii(desired) | END_OF_SUBTAG); if (USTRINGTRIE_HAS_NEXT(result)) { uint64_t desState = suppLengthGt1 ? iter.getState64() : 0; for (;;) { result = iter.next(uprv_invCharToAscii(supported) | END_OF_SUBTAG); int32_t d; if (USTRINGTRIE_HAS_VALUE(result)) { d = iter.getValue(); } else if (star) { d = 0; } else { d = getFallbackRegionDistance(iter, startState); star = true; } if (d > threshold) { return d; } else if (regionDistance < d) { regionDistance = d; } if ((supported = *supportedPartitions++) != 0) { iter.resetToState64(desState); } else { break; } } } else if (!star) { int32_t d = getFallbackRegionDistance(iter, startState); if (d > threshold) { return d; } else if (regionDistance < d) { regionDistance = d; } star = true; } if ((desired = *desiredPartitions++) != 0) { iter.resetToState64(startState); supportedPartitions = supportedStart; supported = *supportedPartitions++; } else { break; } } return regionDistance; } int32_t LocaleDistance::getFallbackRegionDistance(BytesTrie &iter, uint64_t startState) { #if U_DEBUG UStringTrieResult result = #endif iter.resetToState64(startState).next(u'*'); // <*, *> U_ASSERT(USTRINGTRIE_HAS_VALUE(result)); int32_t distance = iter.getValue(); U_ASSERT(distance >= 0); return distance; } int32_t LocaleDistance::trieNext(BytesTrie &iter, const char *s, bool wantValue) { uint8_t c; if ((c = *s) == 0) { return -1; // no empty subtags in the distance data } for (;;) { c = uprv_invCharToAscii(c); // EBCDIC: If *s is not an invariant character, // then c is now 0 and will simply not match anything, which is harmless. uint8_t next = *++s; if (next != 0) { if (!USTRINGTRIE_HAS_NEXT(iter.next(c))) { return -1; } } else { // last character of this subtag UStringTrieResult result = iter.next(c | END_OF_SUBTAG); if (wantValue) { if (USTRINGTRIE_HAS_VALUE(result)) { int32_t value = iter.getValue(); if (result == USTRINGTRIE_FINAL_VALUE) { value |= DISTANCE_IS_FINAL; } return value; } } else { if (USTRINGTRIE_HAS_NEXT(result)) { return 0; } } return -1; } c = next; } } UBool LocaleDistance::isParadigmLSR(const LSR &lsr) const { // Linear search for a very short list (length 6 as of 2019), // because we look for equivalence not equality, and // because it's easy. // If there are many paradigm LSRs we should use a hash set // with custom comparator and hasher. U_ASSERT(paradigmLSRsLength <= 15); for (int32_t i = 0; i < paradigmLSRsLength; ++i) { if (lsr.isEquivalentTo(paradigmLSRs[i])) { return true; } } return false; } U_NAMESPACE_END