Diffstat (limited to 'thirdparty/zstd/compress/zstd_lazy.c')
-rw-r--r--  thirdparty/zstd/compress/zstd_lazy.c | 459
1 file changed, 378 insertions(+), 81 deletions(-)
diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c
index 9ad7e03b54..49ec1b09ef 100644
--- a/thirdparty/zstd/compress/zstd_lazy.c
+++ b/thirdparty/zstd/compress/zstd_lazy.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,11 +58,11 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
 
 /** ZSTD_insertDUBT1() :
  *  sort one already inserted but unsorted position
- *  assumption : current >= btlow == (current - btmask)
+ *  assumption : curr >= btlow == (curr - btmask)
  *  doesn't fail */
 static void
 ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
-                 U32 current, const BYTE* inputEnd,
+                 U32 curr, const BYTE* inputEnd,
                  U32 nbCompares, U32 btLow,
                  const ZSTD_dictMode_e dictMode)
 {
@@ -74,41 +74,41 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const U32 dictLimit = ms->window.dictLimit;
-    const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
-    const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+    const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr;
+    const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit;
     const BYTE* const dictEnd = dictBase + dictLimit;
     const BYTE* const prefixStart = base + dictLimit;
     const BYTE* match;
-    U32* smallerPtr = bt + 2*(current&btMask);
+    U32* smallerPtr = bt + 2*(curr&btMask);
     U32* largerPtr  = smallerPtr + 1;
     U32 matchIndex = *smallerPtr;   /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
     U32 dummy32;   /* to be nullified at the end */
     U32 const windowValid = ms->window.lowLimit;
     U32 const maxDistance = 1U << cParams->windowLog;
-    U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+    U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid;
 
     DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
-                current, dictLimit, windowLow);
-    assert(current >= btLow);
+                curr, dictLimit, windowLow);
+    assert(curr >= btLow);
     assert(ip < iend);   /* condition for ZSTD_count */
 
     while (nbCompares-- && (matchIndex > windowLow)) {
         U32* const nextPtr = bt + 2*(matchIndex & btMask);
         size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger);   /* guaranteed minimum nb of common bytes */
-        assert(matchIndex < current);
+        assert(matchIndex < curr);
 
         /* note : all candidates are now supposed sorted,
          * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
          * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
         if ( (dictMode != ZSTD_extDict)
           || (matchIndex+matchLength >= dictLimit)  /* both in current segment*/
-          || (current < dictLimit) /* both in extDict */) {
+          || (curr < dictLimit) /* both in extDict */) {
             const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
                                      || (matchIndex+matchLength >= dictLimit)) ?
                                         base : dictBase;
             assert( (matchIndex+matchLength >= dictLimit)   /* might be wrong if extDict is incorrectly set to 0 */
-                 || (current < dictLimit) );
+                 || (curr < dictLimit) );
             match = mBase + matchIndex;
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
@@ -119,7 +119,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
         }
 
         DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
-                    current, matchIndex, (U32)matchLength);
+                    curr, matchIndex, (U32)matchLength);
 
         if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
             break;   /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */
@@ -168,7 +168,7 @@ ZSTD_DUBT_findBetterDictMatch (
 
     const BYTE* const base = ms->window.base;
     const BYTE* const prefixStart = base + ms->window.dictLimit;
-    U32         const current = (U32)(ip-base);
+    U32         const curr = (U32)(ip-base);
     const BYTE* const dictBase = dms->window.base;
     const BYTE* const dictEnd = dms->window.nextSrc;
     U32         const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
@@ -195,10 +195,10 @@ ZSTD_DUBT_findBetterDictMatch (
 
         if (matchLength > bestLength) {
             U32 matchIndex = dictMatchIndex + dictIndexDelta;
-            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+            if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-                    current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
-                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
             }
             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
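The `4*(matchLength-bestLength) > highbit32(curr-matchIndex+1) - highbit32(offset+1)` test in the hunk above is the selection heuristic these match finders share: a longer match only replaces the current best when four times the extra length outweighs the extra bits needed to encode the larger offset. For illustration, a minimal standalone C sketch (invented names, not part of the patch):

#include <stdio.h>

/* floor(log2(v)), v != 0; stands in for ZSTD_highbit32() */
static int highbit32(unsigned v)
{
    int r = 0;
    while (v >>= 1) r++;
    return r;
}

/* Accept (newLen, newDist) over (bestLen, bestDist) only when the extra
 * match length pays for the extra offset cost, measured in bits. */
static int better_match(int newLen, unsigned newDist, int bestLen, unsigned bestDist)
{
    return 4 * (newLen - bestLen) > highbit32(newDist + 1) - highbit32(bestDist + 1);
}

int main(void)
{
    printf("%d\n", better_match(6, 2048, 5, 1024));  /* 1 : one extra byte beats one extra offset bit */
    printf("%d\n", better_match(5, 2048, 5, 1024));  /* 0 : same length at a larger offset loses */
    return 0;
}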
@@ -218,9 +218,9 @@ ZSTD_DUBT_findBetterDictMatch (
     }
 
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+        U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                    current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                    curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
 
     return bestLength;
@@ -241,13 +241,13 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
     U32          matchIndex  = hashTable[h];
 
     const BYTE* const base = ms->window.base;
-    U32    const current = (U32)(ip-base);
-    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+    U32    const curr = (U32)(ip-base);
+    U32    const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog);
 
     U32*   const bt = ms->chainTable;
     U32    const btLog  = cParams->chainLog - 1;
     U32    const btMask = (1 << btLog) - 1;
-    U32    const btLow = (btMask >= current) ? 0 : current - btMask;
+    U32    const btLow = (btMask >= curr) ? 0 : curr - btMask;
     U32    const unsortLimit = MAX(btLow, windowLow);
 
     U32*         nextCandidate = bt + 2*(matchIndex&btMask);
@@ -256,8 +256,9 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
     U32          nbCandidates = nbCompares;
     U32          previousCandidate = 0;
 
-    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
     assert(ip <= iend-8);   /* required for h calculation */
+    assert(dictMode != ZSTD_dedicatedDictSearch);
 
     /* reach end of unsorted candidates list */
     while ( (matchIndex > unsortLimit)
@@ -299,14 +300,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         const U32 dictLimit = ms->window.dictLimit;
         const BYTE* const dictEnd = dictBase + dictLimit;
         const BYTE* const prefixStart = base + dictLimit;
-        U32* smallerPtr = bt + 2*(current&btMask);
-        U32* largerPtr  = bt + 2*(current&btMask) + 1;
-        U32 matchEndIdx = current + 8 + 1;
+        U32* smallerPtr = bt + 2*(curr&btMask);
+        U32* largerPtr  = bt + 2*(curr&btMask) + 1;
+        U32 matchEndIdx = curr + 8 + 1;
         U32 dummy32;   /* to be nullified at the end */
         size_t bestLength = 0;
 
         matchIndex  = hashTable[h];
-        hashTable[h] = current;   /* Update Hash Table */
+        hashTable[h] = curr;   /* Update Hash Table */
 
         while (nbCompares-- && (matchIndex > windowLow)) {
             U32* const nextPtr = bt + 2*(matchIndex & btMask);
@@ -326,8 +327,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
             if (matchLength > bestLength) {
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
-                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+                    bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                     if (dictMode == ZSTD_dictMatchState) {
                         nbCompares = 0; /* in addition to avoiding checking any
@@ -363,12 +364,12 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                     mls, dictMode);
         }
 
-        assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+        assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
 
         if (bestLength >= MINMATCH) {
-            U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+            U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
             DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                        current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                        curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
         }
         return bestLength;
     }
@@ -446,7 +447,7 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
 
 /* Update chains up to ip (excluded)
    Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndex_internal(
+FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
                         const BYTE* ip, U32 const mls)
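Promoting ZSTD_insertAndFindFirstIndex_internal to FORCE_INLINE_TEMPLATE above follows zstd's "template emulation" idiom: the generic function takes its variant parameters as compile-time constants, gets force-inlined into one thin wrapper per variant, and the compiler folds away every variant test inside each wrapper. A simplified sketch of the idiom (demo_* names are invented, and this is not zstd's actual macro definition):

#include <stddef.h>

#if defined(__GNUC__)
#  define FORCE_INLINE_TEMPLATE static inline __attribute__((always_inline))
#else
#  define FORCE_INLINE_TEMPLATE static inline
#endif

typedef enum { mode_noDict = 0, mode_extDict = 1 } demo_dictMode_e;

FORCE_INLINE_TEMPLATE size_t demo_search_generic(const char* p, size_t len,
                                                 demo_dictMode_e mode)
{
    size_t hits = 0;
    for (size_t i = 0; i < len; i++) {
        if (mode == mode_extDict) {   /* folded to a constant in each wrapper */
            hits += (p[i] == 'x');
        } else {
            hits += (p[i] == 'y');
        }
    }
    return hits;
}

/* each wrapper compiles to a specialized loop with no mode test inside */
size_t demo_search_noDict (const char* p, size_t n) { return demo_search_generic(p, n, mode_noDict); }
size_t demo_search_extDict(const char* p, size_t n) { return demo_search_generic(p, n, mode_extDict); }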
@@ -475,6 +476,121 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
 }
 
+void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
+{
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32* const hashTable = ms->hashTable;
+    U32* const chainTable = ms->chainTable;
+    U32 const chainSize = 1 << ms->cParams.chainLog;
+    U32 idx = ms->nextToUpdate;
+    U32 const minChain = chainSize < target ? target - chainSize : idx;
+    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const cacheSize = bucketSize - 1;
+    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+    /* We know the hashtable is oversized by a factor of `bucketSize`.
+     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+     * single entry. We will use the rest of the space to construct a temporary
+     * chaintable.
+     */
+    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32* const tmpHashTable = hashTable;
+    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+
+    U32 hashIdx;
+
+    assert(ms->cParams.chainLog <= 24);
+    assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+    assert(idx != 0);
+    assert(tmpMinChain <= minChain);
+
+    /* fill conventional hash table and conventional chain table */
+    for ( ; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+        if (idx >= tmpMinChain) {
+            tmpChainTable[idx - tmpMinChain] = hashTable[h];
+        }
+        tmpHashTable[h] = idx;
+    }
+
+    /* sort chains into ddss chain table */
+    {
+        U32 chainPos = 0;
+        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+            U32 count;
+            U32 countBeyondMinChain = 0;
+            U32 i = tmpHashTable[hashIdx];
+            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+                /* skip through the chain to the first position that won't be
+                 * in the hash cache bucket */
+                if (i < minChain) {
+                    countBeyondMinChain++;
+                }
+                i = tmpChainTable[i - tmpMinChain];
+            }
+            if (count == cacheSize) {
+                for (count = 0; count < chainLimit;) {
+                    if (i < minChain) {
+                        if (!i || countBeyondMinChain++ > cacheSize) {
+                            /* only allow pulling `cacheSize` number of entries
+                             * into the cache or chainTable beyond `minChain`,
+                             * to replace the entries pulled out of the
+                             * chainTable into the cache. This lets us reach
+                             * back further without increasing the total number
+                             * of entries in the chainTable, guaranteeing the
+                             * DDSS chain table will fit into the space
+                             * allocated for the regular one. */
+                            break;
+                        }
+                    }
+                    chainTable[chainPos++] = i;
+                    count++;
+                    if (i < tmpMinChain) {
+                        break;
+                    }
+                    i = tmpChainTable[i - tmpMinChain];
+                }
+            } else {
+                count = 0;
+            }
+            if (count) {
+                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+            } else {
+                tmpHashTable[hashIdx] = 0;
+            }
+        }
+        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
+    }
+
+    /* move chain pointers into the last entry of each hash bucket */
+    for (hashIdx = (1 << hashLog); hashIdx; ) {
+        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 const chainPackedPointer = tmpHashTable[hashIdx];
+        U32 i;
+        for (i = 0; i < cacheSize; i++) {
+            hashTable[bucketIdx + i] = 0;
+        }
+        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+    }
+
+    /* fill the buckets of the hash table */
+    for (idx = ms->nextToUpdate; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 i;
+        /* Shift hash cache down 1. */
+        for (i = cacheSize - 1; i; i--)
+            hashTable[h + i] = hashTable[h + i - 1];
+        hashTable[h] = idx;
+    }
+
+    ms->nextToUpdate = target;
+}
+
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
@@ -493,20 +609,33 @@ size_t ZSTD_HcFindBestMatch_generic (
     const U32 dictLimit = ms->window.dictLimit;
     const BYTE* const prefixStart = base + dictLimit;
     const BYTE* const dictEnd = dictBase + dictLimit;
-    const U32 current = (U32)(ip-base);
+    const U32 curr = (U32)(ip-base);
     const U32 maxDistance = 1U << cParams->windowLog;
     const U32 lowestValid = ms->window.lowLimit;
-    const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+    const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
     const U32 isDictionary = (ms->loadedDictEnd != 0);
     const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
-    const U32 minChain = current > chainSize ? current - chainSize : 0;
+    const U32 minChain = curr > chainSize ? curr - chainSize : 0;
     U32 nbAttempts = 1U << cParams->searchLog;
     size_t ml=4-1;
 
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch
+                         ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+    const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch
+                        ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0;
+
+    U32 matchIndex;
+
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32* entry = &dms->hashTable[ddsIdx];
+        PREFETCH_L1(entry);
+    }
+
     /* HC4 match finder */
-    U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
 
-    for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+    for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
@@ -523,7 +652,7 @@ size_t ZSTD_HcFindBestMatch_generic (
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+            *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
         }
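The key invariant built by ZSTD_dedicatedDictSearch_lazy_loadDictionary above: each hash bucket holds `bucketSize - 1` cached match positions, and its last slot packs the start of the spill-over chain in the upper bits and the chain length in the low 8 bits (hence the 255 cap on chainLimit). A standalone sketch of that packing, with DDS_BUCKET_LOG mirroring ZSTD_LAZY_DDSS_BUCKET_LOG whose actual value lives in zstd_lazy.h:

#include <assert.h>
#include <stdint.h>

#define DDS_BUCKET_LOG 2                     /* stand-in for ZSTD_LAZY_DDSS_BUCKET_LOG */
#define DDS_BUCKET_SIZE (1u << DDS_BUCKET_LOG)

/* mirrors `tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;` */
static uint32_t dds_pack(uint32_t chainStart, uint32_t chainLength)
{
    assert(chainLength <= 0xFF);             /* why chainLimit is capped at 255 */
    return (chainStart << 8) + chainLength;
}

static uint32_t dds_chain_start (uint32_t packed) { return packed >> 8;   }
static uint32_t dds_chain_length(uint32_t packed) { return packed & 0xFF; }

int main(void)
{
    uint32_t const packed = dds_pack(1234, 7);
    assert(dds_chain_start(packed)  == 1234);
    assert(dds_chain_length(packed) == 7);
    return 0;
}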
@@ -531,8 +660,92 @@ size_t ZSTD_HcFindBestMatch_generic (
         matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
     }
 
-    if (dictMode == ZSTD_dictMatchState) {
-        const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    if (dictMode == ZSTD_dedicatedDictSearch) {
+        const U32 ddsLowestIndex  = dms->window.dictLimit;
+        const BYTE* const ddsBase = dms->window.base;
+        const BYTE* const ddsEnd  = dms->window.nextSrc;
+        const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
+        const U32 ddsIndexDelta   = dictLimit - ddsSize;
+        const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+        const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+        U32 ddsAttempt;
+
+        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+            PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+        }
+
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 const chainIndex = chainPackedPointer >> 8;
+
+            PREFETCH_L1(&dms->chainTable[chainIndex]);
+        }
+
+        for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+            size_t currentMl=0;
+            const BYTE* match;
+            matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+            match = ddsBase + matchIndex;
+
+            if (!matchIndex) {
+                return ml;
+            }
+
+            /* guaranteed by table construction */
+            (void)ddsLowestIndex;
+            assert(matchIndex >= ddsLowestIndex);
+            assert(match+4 <= ddsEnd);
+            if (MEM_read32(match) == MEM_read32(ip)) {
+                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+            }
+
+            /* save best solution */
+            if (currentMl > ml) {
+                ml = currentMl;
+                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                if (ip+currentMl == iLimit) {
+                    /* best possible, avoids read overflow on next attempt */
+                    return ml;
+                }
+            }
+        }
+
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 chainIndex = chainPackedPointer >> 8;
+            U32 const chainLength = chainPackedPointer & 0xFF;
+            U32 const chainAttempts = nbAttempts - ddsAttempt;
+            U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+            U32 chainAttempt;
+
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+            }
+
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+                size_t currentMl=0;
+                const BYTE* match;
+                matchIndex = dms->chainTable[chainIndex];
+                match = ddsBase + matchIndex;
+
+                /* guaranteed by table construction */
+                assert(matchIndex >= ddsLowestIndex);
+                assert(match+4 <= ddsEnd);
+                if (MEM_read32(match) == MEM_read32(ip)) {
+                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+                }
+
+                /* save best solution */
+                if (currentMl > ml) {
+                    ml = currentMl;
+                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+                }
+            }
+        }
+    } else if (dictMode == ZSTD_dictMatchState) {
         const U32* const dmsChainTable = dms->chainTable;
         const U32 dmsChainSize         = (1 << dms->cParams.chainLog);
         const U32 dmsChainMask         = dmsChainSize - 1;
@@ -545,7 +758,7 @@ size_t ZSTD_HcFindBestMatch_generic (
 
         matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
 
-        for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+        for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
             size_t currentMl=0;
             const BYTE* const match = dmsBase + matchIndex;
             assert(match+4 <= dmsEnd);
@@ -555,11 +768,12 @@ size_t ZSTD_HcFindBestMatch_generic (
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+                *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
 
             if (matchIndex <= dmsMinChain) break;
+
             matchIndex = dmsChainTable[matchIndex & dmsChainMask];
         }
     }
@@ -600,6 +814,22 @@ static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
 }
 
 
+static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(ms->cParams.minMatch)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
+    case 7 :
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
+    }
+}
+
+
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* const iLimit,
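The dedicated-dict search path added above is a two-phase lookup: first the cached entries stored directly in the hash bucket (prefetched up front, a zero entry terminating the whole search), then the spill-over chain located via the packed pointer in the bucket's last slot (also prefetched before being walked). A simplified, standalone sketch of that control flow, with all names invented:

#include <stdint.h>
#include <stddef.h>

#ifdef __GNUC__
#  define PREFETCH(p) __builtin_prefetch((p), 0 /* read */, 3 /* high locality */)
#else
#  define PREFETCH(p) (void)(p)
#endif

typedef int (*visit_fn)(uint32_t matchIndex, void* ctx);  /* nonzero = stop */

/* Visit up to `attempts` candidate positions for one hash bucket. */
void dds_visit_candidates(const uint32_t* hashTable, const uint32_t* chainTable,
                          size_t bucketIdx, uint32_t bucketSize,
                          uint32_t attempts, visit_fn visit, void* ctx)
{
    uint32_t const packed      = hashTable[bucketIdx + bucketSize - 1];
    uint32_t const chainIndex  = packed >> 8;
    uint32_t const chainLength = packed & 0xFF;
    uint32_t i;

    PREFETCH(&chainTable[chainIndex]);

    /* phase 1: cached entries held directly in the bucket */
    for (i = 0; i < bucketSize - 1 && attempts; i++, attempts--) {
        if (!hashTable[bucketIdx + i]) return;           /* 0 terminates */
        if (visit(hashTable[bucketIdx + i], ctx)) return;
    }
    /* phase 2: spill-over chain recorded by the packed pointer */
    for (i = 0; i < chainLength && attempts; i++, attempts--) {
        if (visit(chainTable[chainIndex + i], ctx)) return;
    }
}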
@@ -641,35 +871,62 @@ ZSTD_compressBlock_lazy_generic(
     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-    searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
-        (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
-                                         : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
-        (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
-                                         : ZSTD_HcFindBestMatch_selectMLS);
+
+    /**
+     * This table is indexed first by the four ZSTD_dictMode_e values, and then
+     * by the two searchMethod_e values. NULLs are placed for configurations
+     * that should never occur (extDict modes go to the other implementation
+     * below and there is no DDSS for binary tree search yet).
+     */
+    const searchMax_f searchFuncs[4][2] = {
+        {
+            ZSTD_HcFindBestMatch_selectMLS,
+            ZSTD_BtFindBestMatch_selectMLS
+        },
+        {
+            NULL,
+            NULL
+        },
+        {
+            ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
+            ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+        },
+        {
+            ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
+            NULL
+        }
+    };
+
+    searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
     U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
 
+    const int isDMS = dictMode == ZSTD_dictMatchState;
+    const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+    const int isDxS = isDMS || isDDS;
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
-    const U32 dictLowestIndex      = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.dictLimit : 0;
-    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.base : NULL;
-    const BYTE* const dictLowest   = dictMode == ZSTD_dictMatchState ?
-                                     dictBase + dictLowestIndex : NULL;
-    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
-                                     dms->window.nextSrc : NULL;
-    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+    const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
+    const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = isDxS ?
                                      prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                      0;
-    const U32 dictAndPrefixLength = (U32)(ip - prefixLowest + dictEnd - dictLowest);
+    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+    assert(searchMax != NULL);
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
 
     /* init */
     ip += (dictAndPrefixLength == 0);
     if (dictMode == ZSTD_noDict) {
-        U32 const maxRep = (U32)(ip - prefixLowest);
+        U32 const curr = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
+        U32 const maxRep = curr - windowLow;
         if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
     }
-    if (dictMode == ZSTD_dictMatchState) {
+    if (isDxS) {
        /* dictMatchState repCode checks don't currently handle repCode == 0
          * disabling. */
         assert(offset_1 <= dictAndPrefixLength);
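The searchFuncs table in the hunk above replaces nested ternaries with a two-dimensional function-pointer dispatch indexed by (dictMode, isBinaryTree), using NULL for combinations that must never be requested. A compilable miniature of the same pattern (types and function bodies are simplified stand-ins, not the zstd signatures):

#include <assert.h>
#include <stddef.h>

typedef enum { m_noDict = 0, m_extDict = 1, m_dictMatchState = 2, m_dedicatedDictSearch = 3 } mode_e;
typedef int (*search_f)(void);

static int hc_plain(void) { return 0; }
static int bt_plain(void) { return 1; }
static int hc_dms(void)   { return 2; }
static int bt_dms(void)   { return 3; }
static int hc_dds(void)   { return 4; }

search_f pick_search(mode_e mode, int isBinaryTree)
{
    static const search_f table[4][2] = {
        { hc_plain, bt_plain },   /* m_noDict */
        { NULL,     NULL     },   /* m_extDict: handled by another function */
        { hc_dms,   bt_dms   },   /* m_dictMatchState */
        { hc_dds,   NULL     },   /* m_dedicatedDictSearch: no BT variant yet */
    };
    search_f const f = table[mode][!!isBinaryTree];
    assert(f != NULL);            /* mirrors assert(searchMax != NULL) */
    return f;
}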
@@ -677,15 +934,21 @@ ZSTD_compressBlock_lazy_generic(
     }
 
     /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
+     */
+    __asm__(".p2align 5");
+#endif
     while (ip < ilimit) {
         size_t matchLength=0;
         size_t offset=0;
         const BYTE* start=ip+1;
 
         /* check repCode */
-        if (dictMode == ZSTD_dictMatchState) {
+        if (isDxS) {
             const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
-            const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+            const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                 && repIndex < prefixLowestIndex) ?
                                    dictBase + (repIndex - dictIndexDelta) :
                                    base + repIndex;
@@ -726,7 +989,7 @@ ZSTD_compressBlock_lazy_generic(
                 if ((mlRep >= 4) && (gain2 > gain1))
                     matchLength = mlRep, offset = 0, start = ip;
             }
-            if (dictMode == ZSTD_dictMatchState) {
+            if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
                 const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                dictBase + (repIndex - dictIndexDelta) :
@@ -761,7 +1024,7 @@ ZSTD_compressBlock_lazy_generic(
                     if ((mlRep >= 4) && (gain2 > gain1))
                         matchLength = mlRep, offset = 0, start = ip;
                 }
-                if (dictMode == ZSTD_dictMatchState) {
+                if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
                     const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                    dictBase + (repIndex - dictIndexDelta) :
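The repIndex handling above relies on dictionary and window content sharing one index space: indices below prefixLowestIndex live in the dictionary buffer, shifted by dictIndexDelta, and everything at or above it lives in the current window. A small sketch of that mapping (struct and names invented for illustration):

#include <stdint.h>

typedef struct {
    const uint8_t* base;          /* current window, indexed directly */
    const uint8_t* dictBase;      /* dictionary buffer */
    uint32_t prefixLowestIndex;   /* first index belonging to the window */
    uint32_t dictIndexDelta;      /* prefixLowestIndex - dictSize */
} seg_view;

/* mirrors `repIndex < prefixLowestIndex ? dictBase + (repIndex - dictIndexDelta)
 *                                       : base + repIndex` in the patch */
const uint8_t* index_to_ptr(const seg_view* v, uint32_t idx)
{
    return (idx < v->prefixLowestIndex)
         ? v->dictBase + (idx - v->dictIndexDelta)   /* dictionary segment */
         : v->base + idx;                            /* window segment */
}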
@@ -799,7 +1062,7 @@ ZSTD_compressBlock_lazy_generic(
                      && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
             }
-            if (dictMode == ZSTD_dictMatchState) {
+            if (isDxS) {
                 U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
@@ -815,12 +1078,11 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        if (dictMode == ZSTD_dictMatchState) {
+        if (isDxS) {
             while (ip <= ilimit) {
                 U32 const current2 = (U32)(ip-base);
                 U32 const repIndex = current2 - offset_2;
-                const BYTE* repMatch = dictMode == ZSTD_dictMatchState
-                    && repIndex < prefixLowestIndex ?
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
                         dictBase - dictIndexDelta + repIndex :
                         base + repIndex;
                 if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
@@ -915,6 +1177,28 @@ size_t ZSTD_compressBlock_greedy_dictMatchState(
 }
 
 
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+}
+
+
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_lazy_extDict_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -929,11 +1213,11 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     const BYTE* const ilimit = iend - 8;
     const BYTE* const base = ms->window.base;
     const U32 dictLimit = ms->window.dictLimit;
-    const U32 lowestIndex = ms->window.lowLimit;
     const BYTE* const prefixStart = base + dictLimit;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const dictEnd  = dictBase + dictLimit;
-    const BYTE* const dictStart  = dictBase + lowestIndex;
+    const BYTE* const dictStart  = dictBase + ms->window.lowLimit;
+    const U32 windowLog = ms->cParams.windowLog;
 
     typedef size_t (*searchMax_f)(
                         ZSTD_matchState_t* ms,
@@ -942,21 +1226,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     U32 offset_1 = rep[0], offset_2 = rep[1];
 
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+
     /* init */
     ip += (ip == prefixStart);
 
     /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability align the loop on 32-bytes.
+     */
+    __asm__(".p2align 5");
+#endif
     while (ip < ilimit) {
         size_t matchLength=0;
         size_t offset=0;
         const BYTE* start=ip+1;
-        U32 current = (U32)(ip-base);
+        U32 curr = (U32)(ip-base);
 
         /* check repCode */
-        {   const U32 repIndex = (U32)(current+1 - offset_1);
+        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog);
+            const U32 repIndex = (U32)(curr+1 - offset_1);
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
-            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))   /* intentional overflow */
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))   /* intentional overflow */
             if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -980,13 +1273,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         if (depth>=1)
         while (ip<ilimit) {
             ip ++;
-            current++;
+            curr++;
             /* check repCode */
             if (offset) {
-                const U32 repIndex = (U32)(current - offset_1);
+                const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                const U32 repIndex = (U32)(curr - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                 const BYTE* const repMatch = repBase + repIndex;
-                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */
                 if (MEM_read32(ip) == MEM_read32(repMatch)) {
                     /* repcode detected */
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1010,13 +1304,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             /* let's find an even better one */
             if ((depth==2) && (ip<ilimit)) {
                 ip ++;
-                current++;
+                curr++;
                 /* check repCode */
                 if (offset) {
-                    const U32 repIndex = (U32)(current - offset_1);
+                    const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
+                    const U32 repIndex = (U32)(curr - offset_1);
                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
                     const BYTE* const repMatch = repBase + repIndex;
-                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */
                     if (MEM_read32(ip) == MEM_read32(repMatch)) {
                         /* repcode detected */
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
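The "intentional overflow" guard repeated in these hunks is worth unpacking: `(U32)((dictLimit-1) - repIndex) >= 3` accepts any repIndex that is either at least 4 bytes below dictLimit (so a 4-byte read stays inside the extDict segment) or at/above dictLimit (the subtraction wraps around to a huge value, and the match is fully in the prefix); it rejects only the three indices that would straddle the segment boundary. The real code pairs it with the `repIndex > windowLow` validity check. A standalone worked example:

#include <assert.h>
#include <stdint.h>

static int rep_read_ok(uint32_t dictLimit, uint32_t repIndex)
{
    return (uint32_t)((dictLimit - 1) - repIndex) >= 3;   /* intentional overflow */
}

int main(void)
{
    uint32_t const dictLimit = 100;
    assert( rep_read_ok(dictLimit, 96));   /* 4 bytes fit below dictLimit */
    assert(!rep_read_ok(dictLimit, 97));   /* read would straddle the boundary */
    assert(!rep_read_ok(dictLimit, 99));
    assert( rep_read_ok(dictLimit, 100));  /* wraps around: fully in the prefix */
    return 0;
}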
@@ -1057,10 +1352,12 @@ _storeSequence:
 
         /* check immediate repcode */
         while (ip <= ilimit) {
-            const U32 repIndex = (U32)((ip-base) - offset_2);
+            const U32 repCurrent = (U32)(ip-base);
+            const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+            const U32 repIndex = repCurrent - offset_2;
             const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
             const BYTE* const repMatch = repBase + repIndex;
-            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > lowestIndex))  /* intentional overflow */
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */
             if (MEM_read32(ip) == MEM_read32(repMatch)) {
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
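For context, a sketch of how a caller might opt into the dedicated dictionary search machinery this diff adds. The switch is, to my recollection, the experimental ZSTD_c_enableDedicatedDictSearch parameter introduced alongside this code in upstream zstd around v1.4.7; treat the exact parameter name and its availability as an assumption to verify against the zstd.h shipped with this tree. Error handling elided.

/* requires the experimental API */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

size_t compress_with_dds(void* dst, size_t dstCap,
                         const void* src, size_t srcSize,
                         const void* dict, size_t dictSize, int level)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t ret;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
    /* trades extra table memory for faster dictionary compression */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableDedicatedDictSearch, 1);
    ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
    ret = ZSTD_compress2(cctx, dst, dstCap, src, srcSize);
    ZSTD_freeCCtx(cctx);
    return ret;
}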