diff options
author | Rémi Verschelde <rverschelde@gmail.com> | 2022-01-24 11:04:45 +0100 |
---|---|---|
committer | Rémi Verschelde <rverschelde@gmail.com> | 2022-06-28 14:10:07 +0200 |
commit | 39ed39900e34e10f40d0d06ed358d2641d7f3315 (patch) | |
tree | f8337007ef2e72c51e2a508983109891366efe2d /thirdparty/zstd/compress/zstd_lazy.c | |
parent | 5ab59ee7df1d344dccc9deef375faa46deb63b8d (diff) |
zstd: Update to upstream version 1.5.2
Release notes:
- https://github.com/facebook/zstd/releases/tag/v1.5.1
- https://github.com/facebook/zstd/releases/tag/v1.5.2
Diffstat (limited to 'thirdparty/zstd/compress/zstd_lazy.c')
-rw-r--r-- | thirdparty/zstd/compress/zstd_lazy.c | 1044 |
1 files changed, 482 insertions, 562 deletions
diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c index 3d523e8472..2e38dcb46d 100644 --- a/thirdparty/zstd/compress/zstd_lazy.c +++ b/thirdparty/zstd/compress/zstd_lazy.c @@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, * assumption : curr >= btlow == (curr - btmask) * doesn't fail */ static void -ZSTD_insertDUBT1(ZSTD_matchState_t* ms, +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) @@ -93,7 +93,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, assert(curr >= btLow); assert(ip < iend); /* condition for ZSTD_count */ - while (nbCompares-- && (matchIndex > windowLow)) { + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ assert(matchIndex < curr); @@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, static size_t ZSTD_DUBT_findBetterDictMatch ( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, size_t bestLength, @@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch ( (void)dictMode; assert(dictMode == ZSTD_dictMatchState); - while (nbCompares-- && (dictMatchIndex > dictLowLimit)) { + for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) { U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match = dictBase + dictMatchIndex; @@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex = dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); } if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ break; /* drop, to guarantee consistency (miss a little bit of compression) */ @@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch ( } if (bestLength >= MINMATCH) { - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -309,7 +309,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, matchIndex = hashTable[h]; hashTable[h] = curr; /* Update Hash Table */ - while (nbCompares-- && (matchIndex > windowLow)) { + for (; nbCompares && (matchIndex > windowLow); --nbCompares) { U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match; @@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > matchEndIdx - matchIndex) matchEndIdx = matchIndex + (U32)matchLength; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex; + bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ if (dictMode == ZSTD_dictMatchState) { nbCompares = 0; /* in addition to avoiding checking any @@ -357,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, *smallerPtr = *largerPtr = 0; + assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ if (dictMode == ZSTD_dictMatchState && nbCompares) { bestLength = ZSTD_DUBT_findBetterDictMatch( ms, ip, iend, @@ -367,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ if (bestLength >= MINMATCH) { - U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -390,54 +391,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); } - -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); - } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); - } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); - } -} - /*********************************** * Dedicated dict search ***********************************/ @@ -450,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32* const chainTable = ms->chainTable; U32 const chainSize = 1 << ms->cParams.chainLog; U32 idx = ms->nextToUpdate; - U32 const minChain = chainSize < target ? target - chainSize : idx; + U32 const minChain = chainSize < target - idx ? target - chainSize : idx; U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; U32 const cacheSize = bucketSize - 1; U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; @@ -464,7 +417,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; U32* const tmpHashTable = hashTable; U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); - U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; + U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; U32 hashIdx; @@ -608,7 +561,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) { /* best possible, avoids read overflow on next attempt */ return ml; @@ -645,7 +598,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE; + *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } @@ -692,7 +645,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -750,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; + *offsetPtr = STORE_OFFSET(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -758,6 +711,7 @@ size_t ZSTD_HcFindBestMatch_generic ( matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); } + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ if (dictMode == ZSTD_dedicatedDictSearch) { ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); @@ -784,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } @@ -797,310 +752,80 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } - -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); - } -} - - -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); - } -} - - -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch); - } -} - - -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); - } -} - /* ********************************* * (SIMD) Row-based matchfinder ***********************************/ /* Constants for row-based hash */ -#define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ -#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) -typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */ - -#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */ - -#include <emmintrin.h> -typedef __m128i ZSTD_Vec128; - -/* Returns a 128-bit container with 128-bits from src */ -static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) { - return _mm_loadu_si128((ZSTD_Vec128 const*)src); -} - -/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */ -static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { - return _mm_set1_epi8((char)val); -} - -/* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask - * into a 32-bit mask that is the MSB of each byte. - * */ -static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { - return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y)); -} - -typedef struct { - __m128i fst; - __m128i snd; -} ZSTD_Vec256; - -static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { - ZSTD_Vec256 v; - v.fst = ZSTD_Vec128_read(ptr); - v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1); - return v; -} - -static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { - ZSTD_Vec256 v; - v.fst = ZSTD_Vec128_set8(val); - v.snd = ZSTD_Vec128_set8(val); - return v; -} - -static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { - ZSTD_VecMask fstMask; - ZSTD_VecMask sndMask; - fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst); - sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); - return fstMask | (sndMask << 16); -} - -#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */ - -#include <arm_neon.h> -typedef uint8x16_t ZSTD_Vec128; - -static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) { - return vld1q_u8((const BYTE* const)src); -} - -static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { - return vdupq_n_u8(val); -} - -/* Mimics '_mm_movemask_epi8()' from SSE */ -static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) { - /* Shift out everything but the MSB bits in each byte */ - uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7)); - /* Merge the even lanes together with vsra (right shift and add) */ - uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7)); - uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - /* Extract the low 8 bits from each lane, merge */ - return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8); -} - -static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { - return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y)); -} - -typedef struct { - uint8x16_t fst; - uint8x16_t snd; -} ZSTD_Vec256; - -static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) { - ZSTD_Vec256 v; - v.fst = ZSTD_Vec128_read(ptr); - v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1); - return v; -} - -static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { - ZSTD_Vec256 v; - v.fst = ZSTD_Vec128_set8(val); - v.snd = ZSTD_Vec128_set8(val); - return v; -} - -static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { - ZSTD_VecMask fstMask; - ZSTD_VecMask sndMask; - fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst); - sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd); - return fstMask | (sndMask << 16); -} - -#else /* Scalar fallback version */ - -#define VEC128_NB_SIZE_T (16 / sizeof(size_t)) -typedef struct { - size_t vec[VEC128_NB_SIZE_T]; -} ZSTD_Vec128; - -static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) { - ZSTD_Vec128 ret; - ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t)); - return ret; -} - -static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) { - ZSTD_Vec128 ret = { {0} }; - int startBit = sizeof(size_t) * 8 - 8; - for (;startBit >= 0; startBit -= 8) { - unsigned j = 0; - for (;j < VEC128_NB_SIZE_T; ++j) { - ret.vec[j] |= ((size_t)val << startBit); - } - } - return ret; -} - -/* Compare x to y, byte by byte, generating a "matches" bitfield */ -static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) { - ZSTD_VecMask res = 0; - unsigned i = 0; - unsigned l = 0; - for (; i < VEC128_NB_SIZE_T; ++i) { - const size_t cmp1 = x.vec[i]; - const size_t cmp2 = y.vec[i]; - unsigned j = 0; - for (; j < sizeof(size_t); ++j, ++l) { - if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) { - res |= ((U32)1 << (j+i*sizeof(size_t))); - } - } - } - return res; -} - -#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T -typedef struct { - size_t vec[VEC256_NB_SIZE_T]; -} ZSTD_Vec256; - -static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) { - ZSTD_Vec256 ret; - ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t)); - return ret; -} - -static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) { - ZSTD_Vec256 ret = { {0} }; - int startBit = sizeof(size_t) * 8 - 8; - for (;startBit >= 0; startBit -= 8) { - unsigned j = 0; - for (;j < VEC256_NB_SIZE_T; ++j) { - ret.vec[j] |= ((size_t)val << startBit); - } - } - return ret; -} - -/* Compare x to y, byte by byte, generating a "matches" bitfield */ -static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) { - ZSTD_VecMask res = 0; - unsigned i = 0; - unsigned l = 0; - for (; i < VEC256_NB_SIZE_T; ++i) { - const size_t cmp1 = x.vec[i]; - const size_t cmp2 = y.vec[i]; - unsigned j = 0; - for (; j < sizeof(size_t); ++j, ++l) { - if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) { - res |= ((U32)1 << (j+i*sizeof(size_t))); - } - } - } - return res; -} - -#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */ +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ /* ZSTD_VecMask_next(): * Starting from the LSB, returns the idx of the next non-zero bit. * Basically counting the nb of trailing zeroes. */ static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { -# if defined(_MSC_VER) /* Visual */ - unsigned long r=0; - return _BitScanForward(&r, val) ? (U32)r : 0; -# elif defined(__GNUC__) && (__GNUC__ >= 3) - return (U32)__builtin_ctz(val); + assert(val != 0); +# if defined(_MSC_VER) && defined(_WIN64) + if (val != 0) { + unsigned long r; + _BitScanForward64(&r, val); + return (U32)(r); + } else { + /* Should not reach this code path */ + __assume(0); + } +# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) + if (sizeof(size_t) == 4) { + U32 mostSignificantWord = (U32)(val >> 32); + U32 leastSignificantWord = (U32)val; + if (leastSignificantWord == 0) { + return 32 + (U32)__builtin_ctz(mostSignificantWord); + } else { + return (U32)__builtin_ctz(leastSignificantWord); + } + } else { + return (U32)__builtin_ctzll(val); + } # else - /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */ - static const U32 multiplyDeBruijnBitPosition[32] = - { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 - }; - return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27]; + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count + * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer + */ + val = ~val & (val - 1ULL); /* Lowest set bit mask */ + val = val - ((val >> 1) & 0x5555555555555555); + val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); # endif } -/* ZSTD_VecMask_rotateRight(): - * Rotates a bitfield to the right by "rotation" bits. - * If the rotation is greater than totalBits, the returned mask is 0. +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. + * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts */ -FORCE_INLINE_TEMPLATE ZSTD_VecMask -ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) { - if (rotation == 0) - return mask; - switch (totalBits) { - default: - assert(0); - case 16: - return (mask >> rotation) | (U16)(mask << (16 - rotation)); - case 32: - return (mask >> rotation) | (U32)(mask << (32 - rotation)); - } +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); } /* ZSTD_row_nextIndex(): @@ -1126,20 +851,24 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { */ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { PREFETCH_L1(hashTable + relRow); - if (rowLog == 5) { + if (rowLog >= 5) { PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ } PREFETCH_L1(tagTable + relRow); - assert(rowLog == 4 || rowLog == 5); + if (rowLog == 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog == 4 || rowLog == 5 || rowLog == 6); assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ - assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */ } /* ZSTD_row_fillHashCache(): * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, * but not beyond iLimit. */ -static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, U32 const rowLog, U32 const mls, U32 idx, const BYTE* const iLimit) { @@ -1179,35 +908,65 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab } } -/* ZSTD_row_update_internal(): - * Inserts the byte at ip into the appropriate position in the hash table. - * Determines the relative row, and the position within the {16, 32} entry row to insert at. +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. */ -FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, - U32 const mls, U32 const rowLog, - U32 const rowMask, U32 const useCache) +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 updateStartIdx, U32 const updateEndIdx, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) { U32* const hashTable = ms->hashTable; U16* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; const BYTE* const base = ms->window.base; - const U32 target = (U32)(ip - base); - U32 idx = ms->nextToUpdate; - DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target); - for (; idx < target; ++idx) { - U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls) - : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) + : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. Explicit cast allows us to get exact desired position within each row */ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); + assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; - row[pos] = idx; + row[pos] = updateStartIdx; } +} + +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, + U32 const mls, U32 const rowLog, + U32 const rowMask, U32 const useCache) +{ + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + const U32 kSkipThreshold = 384; + const U32 kMaxMatchStartPositionsToUpdate = 96; + const U32 kMaxMatchEndPositionsToUpdate = 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound = idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); + idx = target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } + } + assert(target >= idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); ms->nextToUpdate = target; } @@ -1216,7 +975,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const * processing. */ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { - const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); const U32 rowMask = (1u << rowLog) - 1; const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); @@ -1224,26 +983,131 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); } +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) +{ + const __m128i comparisonMask = _mm_set1_epi8((char)tag); + int matches[4] = {0}; + int i; + assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); + for (i=0; i<nbChunks; i++) { + const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); + const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); + matches[i] = _mm_movemask_epi8(equalMask); + } + if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); + if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); + assert(nbChunks == 4); + return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head); +} +#endif + /* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches * the hash at the nth position in a row of the tagTable. - */ -FORCE_INLINE_TEMPLATE -ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) { - ZSTD_VecMask matches = 0; - if (rowEntries == 16) { - ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET); - ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag); - matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags); - } else if (rowEntries == 32) { - ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET); - ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag); - matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags); - } else { - assert(0); + * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield + * to match up with the actual layout of the entries within the hashTable */ +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) +{ + const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); + +#if defined(ZSTD_ARCH_X86_SSE2) + + return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); + +#else /* SW or NEON-LE */ + +# if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { + if (rowEntries == 16) { + const uint8x16_t chunk = vld1q_u8(src); + const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); + const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); + const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); + const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); + const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); + const U16 hi = (U16)vgetq_lane_u8(t3, 8); + const U16 lo = (U16)vgetq_lane_u8(t3, 0); + return ZSTD_rotateRight_U16((hi << 8) | lo, head); + } else if (rowEntries == 32) { + const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); + const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); + const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); + const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); + const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); + const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); + const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); + const uint8x8_t t0 = vreinterpret_u8_s8(pack0); + const uint8x8_t t1 = vreinterpret_u8_s8(pack1); + const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); + const uint8x8x2_t t3 = vuzp_u8(t2, t0); + const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); + const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); + return ZSTD_rotateRight_U32(matches, head); + } else { /* rowEntries == 64 */ + const uint8x16x4_t chunk = vld4q_u8(src); + const uint8x16_t dup = vdupq_n_u8(tag); + const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); + const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); + const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); + const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); + + const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); + const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); + const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); + const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); + const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); + const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); + return ZSTD_rotateRight_U64(matches, head); + } } - /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield - to match up with the actual layout of the entries within the hashTable */ - return ZSTD_VecMask_rotateRight(matches, head, rowEntries); +# endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ + { const size_t chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; + const size_t x80 = x01 << 7; + const size_t splatChar = tag * x01; + ZSTD_VecMask matches = 0; + int i = rowEntries - chunkSize; + assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); + if (MEM_isLittleEndian()) { /* runtime check so have two loops */ + const size_t extractMagic = (xFF / 0x7F) >> chunkSize; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= (chunk * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb = xFF ^ (xFF >> 1); + const size_t extractMagic = (msb / 0x1FF) | msb; + do { + size_t chunk = MEM_readST(&src[i]); + chunk ^= splatChar; + chunk = (((chunk | x80) - x01) | chunk) & x80; + matches <<= chunkSize; + matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; + i -= chunkSize; + } while (i >= 0); + } + matches = ~matches; + if (rowEntries == 16) { + return ZSTD_rotateRight_U16((U16)matches, head); + } else if (rowEntries == 32) { + return ZSTD_rotateRight_U32((U32)matches, head); + } else { + return ZSTD_rotateRight_U64((U64)matches, head); + } + } +#endif } /* The high-level approach of the SIMD row based match finder is as follows: @@ -1262,7 +1126,7 @@ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, con * - Pick the longest match. */ FORCE_INLINE_TEMPLATE -size_t ZSTD_RowFindBestMatch_generic ( +size_t ZSTD_RowFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -1293,11 +1157,13 @@ size_t ZSTD_RowFindBestMatch_generic ( /* DMS/DDS variables that may be referenced laster */ const ZSTD_matchState_t* const dms = ms->dictMatchState; - size_t ddsIdx; - U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ - U32 dmsTag; - U32* dmsRow; - BYTE* dmsTagRow; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; + U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */ + U32 dmsTag = 0; + U32* dmsRow = NULL; + BYTE* dmsTagRow = NULL; if (dictMode == ZSTD_dedicatedDictSearch) { const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; @@ -1329,7 +1195,7 @@ size_t ZSTD_RowFindBestMatch_generic ( U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); U32 const head = *tagRow & rowMask; - U32 matchBuffer[32 /* maximum nb entries per row */]; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); @@ -1379,12 +1245,13 @@ size_t ZSTD_RowFindBestMatch_generic ( /* Save best solution */ if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE; + *offsetPtr = STORE_OFFSET(curr - matchIndex); if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ } } } + assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ if (dictMode == ZSTD_dedicatedDictSearch) { ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); @@ -1397,7 +1264,7 @@ size_t ZSTD_RowFindBestMatch_generic ( const U32 dmsIndexDelta = dictLimit - dmsSize; { U32 const head = *dmsTagRow & rowMask; - U32 matchBuffer[32 /* maximum nb row entries */]; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; size_t numMatches = 0; size_t currMatch = 0; ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); @@ -1426,7 +1293,8 @@ size_t ZSTD_RowFindBestMatch_generic ( if (currentMl > ml) { ml = currentMl; - *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); if (ip+currentMl == iLimit) break; } } @@ -1435,84 +1303,175 @@ size_t ZSTD_RowFindBestMatch_generic ( return ml; } -/* Inlining is important to hardwire a hot branch (template emulation) */ -FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog); - case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog); - case 7 : - case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog); - } -} -FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5); - switch(cappedSearchLog) - { - default : - case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4); - case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5); +typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + +/** + * This struct contains the functions necessary for lazy to search. + * Currently, that is only searchMax. However, it is still valuable to have the + * VTable because this makes it easier to add more functions to the VTable later. + * + * TODO: The start of the search function involves loading and calculating a + * bunch of constants from the ZSTD_matchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compression + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. + */ +typedef struct { + searchMax_f searchMax; +} ZSTD_LazyVTable; + +#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \ + static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \ + ZSTD_BtFindBestMatch_##dictMode##_##mls \ + }; + +#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \ + static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ + } \ + static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \ + ZSTD_HcFindBestMatch_##dictMode##_##mls \ + }; + +#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \ + static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \ + ZSTD_matchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ + } \ + static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \ + ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \ + }; + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE) +/* Generate Binary Tree VTables for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE) +/* Generate Hash Chain VTables for each combination of (dictMode, mls) */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE) + +#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \ + { \ + &ZSTD_BtVTable_##dictMode##_4, \ + &ZSTD_BtVTable_##dictMode##_5, \ + &ZSTD_BtVTable_##dictMode##_6 \ } -} -FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5); - switch(cappedSearchLog) - { - default : - case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4); - case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5); +#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \ + { \ + &ZSTD_HcVTable_##dictMode##_4, \ + &ZSTD_HcVTable_##dictMode##_5, \ + &ZSTD_HcVTable_##dictMode##_6 \ } -} -FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5); - switch(cappedSearchLog) - { - default : - case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4); - case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5); +#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \ + { \ + &ZSTD_RowVTable_##dictMode##_##mls##_4, \ + &ZSTD_RowVTable_##dictMode##_##mls##_5, \ + &ZSTD_RowVTable_##dictMode##_##mls##_6 \ } -} -FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5); - switch(cappedSearchLog) - { - default : - case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4); - case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5); +#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \ + { \ + GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \ + GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \ + GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \ } -} +#define GEN_ZSTD_VTABLE_ARRAY(X) \ + { \ + X(noDict), \ + X(extDict), \ + X(dictMatchState), \ + X(dedicatedDictSearch) \ + } /* ******************************* * Common parser - lazy strategy *********************************/ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; +/** + * This table is indexed first by the four ZSTD_dictMode_e values, and then + * by the two searchMethod_e values. NULLs are placed for configurations + * that should never occur (extDict modes go to the other implementation + * below and there is no DDSS for binary tree search yet). + */ + +static ZSTD_LazyVTable const* +ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode) +{ + /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */ + ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY); + ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY); + /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */ + ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY); + + U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch)); + U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog)); + switch (searchMethod) { + case search_hashChain: + return hcVTables[dictMode][mls - 4]; + case search_binaryTree: + return btVTables[dictMode][mls - 4]; + case search_rowHash: + return rowVTables[dictMode][mls - 4][rowLog - 4]; + default: + return NULL; + } +} + FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, @@ -1525,46 +1484,12 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip = istart; const BYTE* anchor = istart; const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; + const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base = ms->window.base; const U32 prefixLowestIndex = ms->window.dictLimit; const BYTE* const prefixLowest = base + prefixLowestIndex; - const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - - /** - * This table is indexed first by the four ZSTD_dictMode_e values, and then - * by the two searchMethod_e values. NULLs are placed for configurations - * that should never occur (extDict modes go to the other implementation - * below and there is no DDSS for binary tree search yet). - */ - const searchMax_f searchFuncs[4][3] = { - { - ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS, - ZSTD_RowFindBestMatch_selectRowLog - }, - { - NULL, - NULL, - NULL - }, - { - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS, - ZSTD_RowFindBestMatch_dictMatchState_selectRowLog - }, - { - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL, - ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog - } - }; - - searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod]; + searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax; U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; const int isDMS = dictMode == ZSTD_dictMatchState; @@ -1599,6 +1524,7 @@ ZSTD_compressBlock_lazy_generic( } if (searchMethod == search_rowHash) { + const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog)); ZSTD_row_fillHashCache(ms, base, rowLog, MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), ms->nextToUpdate, ilimit); @@ -1613,8 +1539,9 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offcode=STORE_REPCODE_1; const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); /* check repCode */ if (isDxS) { @@ -1640,7 +1567,7 @@ ZSTD_compressBlock_lazy_generic( { size_t offsetFound = 999999999; size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offcode=offsetFound; } if (matchLength < 4) { @@ -1651,14 +1578,15 @@ ZSTD_compressBlock_lazy_generic( /* let's try to find a better solution */ if (depth>=1) while (ip<ilimit) { + DEBUGLOG(7, "search depth 1"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1670,30 +1598,31 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } } { size_t offset2=999999999; size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; /* search a better one */ } } /* let's find an even better one */ if ((depth==2) && (ip<ilimit)) { + DEBUGLOG(7, "search depth 2"); ip ++; if ( (dictMode == ZSTD_noDict) - && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } if (isDxS) { const U32 repIndex = (U32)(ip - base) - offset_1; @@ -1705,46 +1634,45 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 = (int)(mlRep * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >= 4) && (gain2 > gain1)) - matchLength = mlRep, offset = 0, start = ip; + matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; } } { size_t offset2=999999999; size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* NOTE: - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which - * overflows the pointer, which is undefined behavior. + * Pay attention that `start[-value]` can lead to strange undefined behavior + * notably if `value` is unsigned, resulting in a large positive `-value`. */ /* catch up */ - if (offset) { + if (STORED_IS_OFFSET(offcode)) { if (dictMode == ZSTD_noDict) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) - && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) + && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); anchor = ip = start + matchLength; } @@ -1760,8 +1688,8 @@ _storeSequence: && (MEM_read32(repMatch) == MEM_read32(ip)) ) { const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; @@ -1775,8 +1703,8 @@ _storeSequence: && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ @@ -1955,15 +1883,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 windowLog = ms->cParams.windowLog; const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5; - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); - const searchMax_f searchFuncs[3] = { - ZSTD_HcFindBestMatch_extDict_selectMLS, - ZSTD_BtFindBestMatch_extDict_selectMLS, - ZSTD_RowFindBestMatch_extDict_selectRowLog - }; - searchMax_f searchMax = searchFuncs[(int)searchMethod]; + searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax; U32 offset_1 = rep[0], offset_2 = rep[1]; DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); @@ -1985,7 +1905,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=0; - size_t offset=0; + size_t offcode=STORE_REPCODE_1; const BYTE* start=ip+1; U32 curr = (U32)(ip-base); @@ -1995,7 +1915,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ - & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ if (MEM_read32(ip+1) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; @@ -2007,10 +1927,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( { size_t offsetFound = 999999999; size_t const ml2 = searchMax(ms, ip, iend, &offsetFound); if (ml2 > matchLength) - matchLength = ml2, start = ip, offset=offsetFound; + matchLength = ml2, start = ip, offcode=offsetFound; } - if (matchLength < 4) { + if (matchLength < 4) { ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ continue; } @@ -2021,30 +1941,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ - & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 3); - int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; } } /* search match, depth 1 */ { size_t offset2=999999999; size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; /* search a better one */ } } @@ -2053,48 +1973,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); const U32 repIndex = (U32)(curr - offset_1); const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ - & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 = (int)(repLength * 4); - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >= 4) && (gain2 > gain1)) - matchLength = repLength, offset = 0, start = ip; + matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; } } /* search match, depth 2 */ { size_t offset2=999999999; size_t const ml2 = searchMax(ms, ip, iend, &offset2); - int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ - int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >= 4) && (gain2 > gain1)) { - matchLength = ml2, offset = offset2, start = ip; + matchLength = ml2, offcode = offset2, start = ip; continue; } } } break; /* nothing found : store previous solution */ } /* catch up */ - if (offset) { - U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + if (STORED_IS_OFFSET(offcode)) { + U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); } /* store sequence */ _storeSequence: - { size_t const litLength = start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + { size_t const litLength = (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); anchor = ip = start + matchLength; } @@ -2106,13 +2026,13 @@ _storeSequence: const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; const BYTE* const repMatch = repBase + repIndex; if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ - & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ if (MEM_read32(ip) == MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ip += matchLength; anchor = ip; continue; /* faster when present ... (?) */ |