Diffstat (limited to 'thirdparty/zstd/compress/zstd_lazy.c')
-rw-r--r--  thirdparty/zstd/compress/zstd_lazy.c  1056
1 file changed, 914 insertions(+), 142 deletions(-)
diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c
index 49ec1b09ef..3d523e8472 100644
--- a/thirdparty/zstd/compress/zstd_lazy.c
+++ b/thirdparty/zstd/compress/zstd_lazy.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) Yann Collet, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under both the BSD-style license (found in the
@@ -438,43 +438,9 @@ static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
}
}
-
-
-/* *********************************
-* Hash Chain
+/***********************************
+* Dedicated dict search
***********************************/
-#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
-
-/* Update chains up to ip (excluded)
- Assumption : always within prefix (i.e. not within extDict) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
- ZSTD_matchState_t* ms,
- const ZSTD_compressionParameters* const cParams,
- const BYTE* ip, U32 const mls)
-{
- U32* const hashTable = ms->hashTable;
- const U32 hashLog = cParams->hashLog;
- U32* const chainTable = ms->chainTable;
- const U32 chainMask = (1 << cParams->chainLog) - 1;
- const BYTE* const base = ms->window.base;
- const U32 target = (U32)(ip - base);
- U32 idx = ms->nextToUpdate;
-
- while(idx < target) { /* catch up */
- size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
- NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
- hashTable[h] = idx;
- idx++;
- }
-
- ms->nextToUpdate = target;
- return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
-}
-
-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
- const ZSTD_compressionParameters* const cParams = &ms->cParams;
- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
-}
void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
{
@@ -500,11 +466,10 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
-
U32 hashIdx;
assert(ms->cParams.chainLog <= 24);
- assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+ assert(ms->cParams.hashLog > ms->cParams.chainLog);
assert(idx != 0);
assert(tmpMinChain <= minChain);
@@ -535,7 +500,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
if (count == cacheSize) {
for (count = 0; count < chainLimit;) {
if (i < minChain) {
- if (!i || countBeyondMinChain++ > cacheSize) {
+ if (!i || ++countBeyondMinChain > cacheSize) {
/* only allow pulling `cacheSize` number of entries
* into the cache or chainTable beyond `minChain`,
* to replace the entries pulled out of the
@@ -591,6 +556,139 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
ms->nextToUpdate = target;
}
+/* Returns the longest match length found in the dedicated dict search structure.
+ * If none are longer than the argument ml, then ml will be returned.
+ */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts,
+ const ZSTD_matchState_t* const dms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ const BYTE* const prefixStart, const U32 curr,
+ const U32 dictLimit, const size_t ddsIdx) {
+ const U32 ddsLowestIndex = dms->window.dictLimit;
+ const BYTE* const ddsBase = dms->window.base;
+ const BYTE* const ddsEnd = dms->window.nextSrc;
+ const U32 ddsSize = (U32)(ddsEnd - ddsBase);
+ const U32 ddsIndexDelta = dictLimit - ddsSize;
+ const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
+ const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
+ U32 ddsAttempt;
+ U32 matchIndex;
+
+ for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
+ PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 const chainIndex = chainPackedPointer >> 8;
+
+ PREFETCH_L1(&dms->chainTable[chainIndex]);
+ }
+
+ for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
+ match = ddsBase + matchIndex;
+
+ if (!matchIndex) {
+ return ml;
+ }
+
+ /* guaranteed by table construction */
+ (void)ddsLowestIndex;
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) {
+ /* best possible, avoids read overflow on next attempt */
+ return ml;
+ }
+ }
+ }
+
+ {
+ U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+ U32 chainIndex = chainPackedPointer >> 8;
+ U32 const chainLength = chainPackedPointer & 0xFF;
+ U32 const chainAttempts = nbAttempts - ddsAttempt;
+ U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+ U32 chainAttempt;
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+ PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
+ }
+
+ for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+ size_t currentMl=0;
+ const BYTE* match;
+ matchIndex = dms->chainTable[chainIndex];
+ match = ddsBase + matchIndex;
+
+ /* guaranteed by table construction */
+ assert(matchIndex >= ddsLowestIndex);
+ assert(match+4 <= ddsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+ return ml;
+}
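[Editorial sketch, not part of the patch] The two `chainPackedPointer` reads above decode a 24/8-bit split. A hypothetical helper showing how the loader would pack it, consistent with the `>> 8` / `& 0xFF` unpacking here and the `chainLog <= 24` assert in ZSTD_dedicatedDictSearch_lazy_loadDictionary():

    static U32 packDDSChainPointer(U32 chainIndex, U32 chainLength)
    {
        assert(chainIndex < (1u << 24)); /* guaranteed: cParams.chainLog <= 24 */
        assert(chainLength <= 0xFF);     /* chain length must fit in one byte */
        return (chainIndex << 8) | chainLength;
    }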
+
+
+/* *********************************
+* Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
+
+/* Update chains up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+ ZSTD_matchState_t* ms,
+ const ZSTD_compressionParameters* const cParams,
+ const BYTE* ip, U32 const mls)
+{
+ U32* const hashTable = ms->hashTable;
+ const U32 hashLog = cParams->hashLog;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ while(idx < target) { /* catch up */
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
+ }
+
+ ms->nextToUpdate = target;
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+}
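[Editorial sketch, not part of the patch] The chainTable built here is an implicit singly linked list: hashTable[h] holds the newest position hashing to h, and each chainTable slot points at the previous such position. A hypothetical consumer (window handling simplified):

    static U32 countChainCandidates(const U32* chainTable, U32 chainMask,
                                    U32 head, U32 lowLimit, U32 maxAttempts)
    {
        U32 idx = head;   /* newest position for this hash bucket */
        U32 n = 0;
        while ((idx >= lowLimit) && (n < maxAttempts)) {
            n++;                               /* idx is a candidate match position */
            idx = chainTable[idx & chainMask]; /* NEXT_IN_CHAIN(idx, chainMask) */
        }
        return n;
    }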
/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
@@ -661,90 +759,8 @@ size_t ZSTD_HcFindBestMatch_generic (
}
if (dictMode == ZSTD_dedicatedDictSearch) {
- const U32 ddsLowestIndex = dms->window.dictLimit;
- const BYTE* const ddsBase = dms->window.base;
- const BYTE* const ddsEnd = dms->window.nextSrc;
- const U32 ddsSize = (U32)(ddsEnd - ddsBase);
- const U32 ddsIndexDelta = dictLimit - ddsSize;
- const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
- const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
- U32 ddsAttempt;
-
- for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
- PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
- }
-
- {
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
- U32 const chainIndex = chainPackedPointer >> 8;
-
- PREFETCH_L1(&dms->chainTable[chainIndex]);
- }
-
- for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
- size_t currentMl=0;
- const BYTE* match;
- matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
- match = ddsBase + matchIndex;
-
- if (!matchIndex) {
- return ml;
- }
-
- /* guaranteed by table construction */
- (void)ddsLowestIndex;
- assert(matchIndex >= ddsLowestIndex);
- assert(match+4 <= ddsEnd);
- if (MEM_read32(match) == MEM_read32(ip)) {
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
- }
-
- /* save best solution */
- if (currentMl > ml) {
- ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
- if (ip+currentMl == iLimit) {
- /* best possible, avoids read overflow on next attempt */
- return ml;
- }
- }
- }
-
- {
- U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
- U32 chainIndex = chainPackedPointer >> 8;
- U32 const chainLength = chainPackedPointer & 0xFF;
- U32 const chainAttempts = nbAttempts - ddsAttempt;
- U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
- U32 chainAttempt;
-
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
- PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
- }
-
- for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
- size_t currentMl=0;
- const BYTE* match;
- matchIndex = dms->chainTable[chainIndex];
- match = ddsBase + matchIndex;
-
- /* guaranteed by table construction */
- assert(matchIndex >= ddsLowestIndex);
- assert(match+4 <= ddsEnd);
- if (MEM_read32(match) == MEM_read32(ip)) {
- /* assumption : matchIndex <= dictLimit-4 (by table construction) */
- currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
- }
-
- /* save best solution */
- if (currentMl > ml) {
- ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
- if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
- }
- }
- }
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
} else if (dictMode == ZSTD_dictMatchState) {
const U32* const dmsChainTable = dms->chainTable;
const U32 dmsChainSize = (1 << dms->cParams.chainLog);
@@ -845,11 +861,657 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
}
}
+/* *********************************
+* (SIMD) Row-based matchfinder
+***********************************/
+/* Constants for row-based hash */
+#define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
+#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+
+#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
+
+typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
+
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
+
+#include <emmintrin.h>
+typedef __m128i ZSTD_Vec128;
+
+/* Returns a 128-bit container with 128-bits from src */
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ return _mm_loadu_si128((ZSTD_Vec128 const*)src);
+}
+
+/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ return _mm_set1_epi8((char)val);
+}
+
+/* Compare x and y byte by byte, then collapse the 128-bit result into a
+ * 32-bit mask built from the MSB of each byte.
+ */
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
+}
+
+typedef struct {
+ __m128i fst;
+ __m128i snd;
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_read(ptr);
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+ return v;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_set8(val);
+ v.snd = ZSTD_Vec128_set8(val);
+ return v;
+}
+
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask fstMask;
+ ZSTD_VecMask sndMask;
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+ return fstMask | (sndMask << 16);
+}
+
+#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
+
+#include <arm_neon.h>
+typedef uint8x16_t ZSTD_Vec128;
+
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ return vld1q_u8((const BYTE* const)src);
+}
+
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ return vdupq_n_u8(val);
+}
+
+/* Mimics '_mm_movemask_epi8()' from SSE */
+static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
+ /* Shift out everything but the MSB bits in each byte */
+ uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
+ /* Merge the even lanes together with vsra (right shift and add) */
+ uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
+ uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+ uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+ /* Extract the low 8 bits from each lane, merge */
+ return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
+}
+
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
+}
+
+typedef struct {
+ uint8x16_t fst;
+ uint8x16_t snd;
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_read(ptr);
+ v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
+ return v;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 v;
+ v.fst = ZSTD_Vec128_set8(val);
+ v.snd = ZSTD_Vec128_set8(val);
+ return v;
+}
+
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask fstMask;
+ ZSTD_VecMask sndMask;
+ fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
+ sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
+ return fstMask | (sndMask << 16);
+}
+
+#else /* Scalar fallback version */
+
+#define VEC128_NB_SIZE_T (16 / sizeof(size_t))
+typedef struct {
+ size_t vec[VEC128_NB_SIZE_T];
+} ZSTD_Vec128;
+
+static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
+ ZSTD_Vec128 ret;
+ ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
+ return ret;
+}
+
+static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
+ ZSTD_Vec128 ret = { {0} };
+ int startBit = sizeof(size_t) * 8 - 8;
+ for (;startBit >= 0; startBit -= 8) {
+ unsigned j = 0;
+ for (;j < VEC128_NB_SIZE_T; ++j) {
+ ret.vec[j] |= ((size_t)val << startBit);
+ }
+ }
+ return ret;
+}
+
+/* Compare x to y, byte by byte, generating a "matches" bitfield */
+static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
+ ZSTD_VecMask res = 0;
+ unsigned i = 0;
+ unsigned l = 0;
+ for (; i < VEC128_NB_SIZE_T; ++i) {
+ const size_t cmp1 = x.vec[i];
+ const size_t cmp2 = y.vec[i];
+ unsigned j = 0;
+ for (; j < sizeof(size_t); ++j, ++l) {
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
+ }
+ }
+ }
+ return res;
+}
+
+#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
+typedef struct {
+ size_t vec[VEC256_NB_SIZE_T];
+} ZSTD_Vec256;
+
+static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
+ ZSTD_Vec256 ret;
+ ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
+ return ret;
+}
+
+static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
+ ZSTD_Vec256 ret = { {0} };
+ int startBit = sizeof(size_t) * 8 - 8;
+ for (;startBit >= 0; startBit -= 8) {
+ unsigned j = 0;
+ for (;j < VEC256_NB_SIZE_T; ++j) {
+ ret.vec[j] |= ((size_t)val << startBit);
+ }
+ }
+ return ret;
+}
+
+/* Compare x to y, byte by byte, generating a "matches" bitfield */
+static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
+ ZSTD_VecMask res = 0;
+ unsigned i = 0;
+ unsigned l = 0;
+ for (; i < VEC256_NB_SIZE_T; ++i) {
+ const size_t cmp1 = x.vec[i];
+ const size_t cmp2 = y.vec[i];
+ unsigned j = 0;
+ for (; j < sizeof(size_t); ++j, ++l) {
+ if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
+ res |= ((U32)1 << (j+i*sizeof(size_t)));
+ }
+ }
+ }
+ return res;
+}
+
+#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
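[Editorial sketch, not part of the patch] All three variants expose the same small API; the row matchfinder below consumes it roughly as follows (hypothetical helper name):

    /* Bit n of the result is set iff row16[n] == tag. */
    static ZSTD_VecMask exampleTagScan16(const BYTE* row16, BYTE tag)
    {
        ZSTD_Vec128 const rowTags  = ZSTD_Vec128_read(row16);
        ZSTD_Vec128 const splatTag = ZSTD_Vec128_set8(tag);
        return ZSTD_Vec128_cmpMask8(rowTags, splatTag);
    }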
+
+/* ZSTD_VecMask_next():
+ * Starting from the LSB, returns the idx of the next non-zero bit.
+ * Basically counting the nb of trailing zeroes.
+ */
+static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+# if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanForward(&r, val) ? (U32)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (U32)__builtin_ctz(val);
+# else
+ /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
+ static const U32 multiplyDeBruijnBitPosition[32] =
+ {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+ return multiplyDeBruijnBitPosition[((U32)((val & -(int)val) * 0x077CB531U)) >> 27];
+# endif
+}
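[Editorial sketch, not part of the patch] This pairs with the `matches &= (matches - 1)` loops further down; together they visit each set bit exactly once, lowest first:

    static void forEachMatchBit(ZSTD_VecMask matches)
    {
        for (; matches > 0; matches &= (matches - 1)) {    /* clear lowest set bit */
            U32 const bitIdx = ZSTD_VecMask_next(matches); /* index of lowest set bit */
            (void)bitIdx; /* the real search maps bitIdx to a row entry here */
        }
    }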
+
+/* ZSTD_VecMask_rotateRight():
+ * Rotates a bitfield to the right by "rotation" bits.
+ * If the rotation is greater than totalBits, the returned mask is 0.
+ */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
+ if (rotation == 0)
+ return mask;
+ switch (totalBits) {
+ default:
+ assert(0);
+ case 16:
+ return (mask >> rotation) | (U16)(mask << (16 - rotation));
+ case 32:
+ return (mask >> rotation) | (U32)(mask << (32 - rotation));
+ }
+}
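[Editorial worked example] Rotation moves bits toward the LSB, with bit 0 wrapping to the top bit of the row-sized field:

    assert(ZSTD_VecMask_rotateRight(0x0001, 1, 16) == 0x8000);
    assert(ZSTD_VecMask_rotateRight(0x0003, 1, 32) == 0x80000001u);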
+
+/* ZSTD_row_nextIndex():
+ * Returns the next index to insert at within a tagTable row, and updates the "head"
+ * value to reflect the update. Essentially cycles backwards through [0, {entries per row}).
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
+ U32 const next = (*tagRow - 1) & rowMask;
+ *tagRow = (BYTE)next;
+ return next;
+}
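[Editorial worked example] Starting from a fresh row head of 0, successive insertions walk the slots backwards, i.e. the row is a circular buffer filled in reverse:

    BYTE head = 0;
    U32 const rowMask = 15; /* 16 entries per row */
    assert(ZSTD_row_nextIndex(&head, rowMask) == 15);
    assert(ZSTD_row_nextIndex(&head, rowMask) == 14);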
+
+/* ZSTD_isAligned():
+ * Checks that a pointer is aligned to "align" bytes which must be a power of 2.
+ */
+MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+ assert((align & (align - 1)) == 0);
+ return (((size_t)ptr) & (align - 1)) == 0;
+}
+
+/* ZSTD_row_prefetch():
+ * Performs prefetching for the hashTable and tagTable at a given row.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+ PREFETCH_L1(hashTable + relRow);
+ if (rowLog == 5) {
+ PREFETCH_L1(hashTable + relRow + 16);
+ }
+ PREFETCH_L1(tagTable + relRow);
+ assert(rowLog == 4 || rowLog == 5);
+ assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
+}
+
+/* ZSTD_row_fillHashCache():
+ * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+ * but not beyond iLimit.
+ */
+static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+ U32 const rowLog, U32 const mls,
+ U32 idx, const BYTE* const iLimit)
+{
+ U32 const* const hashTable = ms->hashTable;
+ U16 const* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
+ U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
+
+ for (; idx < lim; ++idx) {
+ U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
+ }
+
+ DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
+ ms->hashCache[2], ms->hashCache[3], ms->hashCache[4],
+ ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]);
+}
+
+/* ZSTD_row_nextCachedHash():
+ * Returns the hash of base + idx, and replaces it in the hash cache with the hash of
+ * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
+ */
+FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+ U16 const* tagTable, BYTE const* base,
+ U32 idx, U32 const hashLog,
+ U32 const rowLog, U32 const mls)
+{
+ U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
+ { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
+ cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash;
+ return hash;
+ }
+}
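[Editorial note] The invariant these two functions maintain: after ZSTD_row_fillHashCache(..., idx0, ...), slot (i & ZSTD_ROW_HASH_CACHE_MASK) holds the hash of base + i for each i in [idx0, idx0 + ZSTD_ROW_HASH_CACHE_SIZE); each ZSTD_row_nextCachedHash(i) then returns that hash and refills the slot with the hash of base + i + ZSTD_ROW_HASH_CACHE_SIZE, so row prefetches always run ZSTD_ROW_HASH_CACHE_SIZE positions ahead of the search.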
+
+/* ZSTD_row_update_internal():
+ * Inserts entries for all positions from ms->nextToUpdate up to ip (excluded) into the hash table.
+ * For each, determines the relative row, and the position within the {16, 32} entry row to insert at.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
+{
+ U32* const hashTable = ms->hashTable;
+ U16* const tagTable = ms->tagTable;
+ U32 const hashLog = ms->rowHashLog;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
+ for (; idx < target; ++idx) {
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
+ : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
+ Explicit cast allows us to get exact desired position within each row */
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+
+ assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+ ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+ row[pos] = idx;
+ }
+ ms->nextToUpdate = target;
+}
+
+/* ZSTD_row_update():
+ * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary
+ * processing.
+ */
+void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+ const U32 rowMask = (1u << rowLog) - 1;
+ const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
+
+ DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+}
+
+/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
+ * the hash at the nth position in a row of the tagTable.
+ */
+FORCE_INLINE_TEMPLATE
+ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
+ ZSTD_VecMask matches = 0;
+ if (rowEntries == 16) {
+ ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+ ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
+ matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
+ } else if (rowEntries == 32) {
+ ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
+ ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
+ matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
+ } else {
+ assert(0);
+ }
+ /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+ to match up with the actual layout of the entries within the hashTable */
+ return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
+}
+
+/* The high-level approach of the SIMD row based match finder is as follows:
+ * - Figure out where to insert the new entry:
+ * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+ * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ * which row to insert into.
+ * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+ * be considered as a circular buffer with a "head" index that resides in the tagTable.
+ * - Also insert the "tag" into the equivalent row and position in the tagTable.
+ * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+ * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+ * for alignment/performance reasons, leaving some bytes unused.
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ * generate a bitfield that we can cycle through to check the collisions in the hash table.
+ * - Pick the longest match.
+ */
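[Editorial sketch] The resulting layout of one 16-entry row (rowLog == 4), per the description above:

    /* tagTable row (16 x U16 == 32 bytes):
     *   byte 0       : "head", index of the most recently used slot
     *   bytes 1..16  : one 1-byte tag per entry (ZSTD_ROW_HASH_TAG_OFFSET == 1)
     *   bytes 17..31 : unused padding, keeps rows 32-byte aligned
     * The matching hashTable row holds 16 U32 match indices at the same relRow. */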
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_RowFindBestMatch_generic (
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls, const ZSTD_dictMode_e dictMode,
+ const U32 rowLog)
+{
+ U32* const hashTable = ms->hashTable;
+ U16* const tagTable = ms->tagTable;
+ U32* const hashCache = ms->hashCache;
+ const U32 hashLog = ms->rowHashLog;
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const U32 curr = (U32)(ip-base);
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 lowestValid = ms->window.lowLimit;
+ const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+ const U32 rowEntries = (1U << rowLog);
+ const U32 rowMask = rowEntries - 1;
+ const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+ U32 nbAttempts = 1U << cappedSearchLog;
+ size_t ml=4-1;
+
+ /* DMS/DDS variables that may be referenced later */
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ size_t ddsIdx;
+ U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+ U32 dmsTag;
+ U32* dmsRow;
+ BYTE* dmsTagRow;
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+ { /* Prefetch DDS hashtable entry */
+ ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG;
+ PREFETCH_L1(&dms->hashTable[ddsIdx]);
+ }
+ ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0;
+ }
+
+ if (dictMode == ZSTD_dictMatchState) {
+ /* Prefetch DMS rows */
+ U32* const dmsHashTable = dms->hashTable;
+ U16* const dmsTagTable = dms->tagTable;
+ U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
+ dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow);
+ dmsRow = dmsHashTable + dmsRelRow;
+ ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog);
+ }
+
+ /* Update the hashTable and tagTable up to (but not including) ip */
+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+ { /* Get the hash for ip, compute the appropriate row */
+ U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
+ U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
+ U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
+ U32* const row = hashTable + relRow;
+ BYTE* tagRow = (BYTE*)(tagTable + relRow);
+ U32 const head = *tagRow & rowMask;
+ U32 matchBuffer[32 /* maximum nb entries per row */];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+
+ /* Cycle through the matches and prefetch */
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = row[matchPos];
+ assert(numMatches < rowEntries);
+ if (matchIndex < lowLimit)
+ break;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ PREFETCH_L1(base + matchIndex);
+ } else {
+ PREFETCH_L1(dictBase + matchIndex);
+ }
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
+ in ZSTD_row_update_internal() at the next search. */
+ {
+ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
+ tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+ row[pos] = ms->nextToUpdate++;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex < curr);
+ assert(matchIndex >= lowLimit);
+
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ if (match[ml] == ip[ml]) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* Save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+ }
+ }
+
+ if (dictMode == ZSTD_dedicatedDictSearch) {
+ ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
+ ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* TODO: Measure and potentially add prefetching to DMS */
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+
+ { U32 const head = *dmsTagRow & rowMask;
+ U32 matchBuffer[32 /* maximum nb row entries */];
+ size_t numMatches = 0;
+ size_t currMatch = 0;
+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+
+ for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
+ U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+ U32 const matchIndex = dmsRow[matchPos];
+ if (matchIndex < dmsLowestIndex)
+ break;
+ PREFETCH_L1(dmsBase + matchIndex);
+ matchBuffer[numMatches++] = matchIndex;
+ }
+
+ /* Return the longest match */
+ for (; currMatch < numMatches; ++currMatch) {
+ U32 const matchIndex = matchBuffer[currMatch];
+ size_t currentMl=0;
+ assert(matchIndex >= dmsLowestIndex);
+ assert(matchIndex < curr);
+
+ { const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip))
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+ }
+
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break;
+ }
+ }
+ }
+ }
+ return ml;
+}
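[Editorial note] The `*offsetPtr = curr - matchIndex + ZSTD_REP_MOVE` stores follow zstd's offset-code convention: the raw match distance is biased by ZSTD_REP_MOVE so that values up to ZSTD_REP_MOVE stay reserved for repcode references.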
+
+/* Inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
+ case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
+ case 7 :
+ case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
+ }
+}
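[Editorial sketch, not part of the patch] The "template emulation" above in miniature: force-inlining a generic body into wrappers that pass literal constants makes the compiler emit one specialized copy per constant, resolving the hot mls/rowLog branches at compile time:

    FORCE_INLINE_TEMPLATE int genericShift(int x, int const shiftAmount)
    {
        return x << shiftAmount;  /* shiftAmount is a literal at every call site */
    }
    static int shiftBy4(int x) { return genericShift(x, 4); } /* compiles to x << 4 */
    static int shiftBy5(int x) { return genericShift(x, 5); } /* compiles to x << 5 */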
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+ }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+ }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+ }
+}
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
+ switch(cappedSearchLog)
+ {
+ default :
+ case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
+ case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+ }
+}
+
/* *******************************
* Common parser - lazy strategy
*********************************/
-typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
+typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
@@ -863,10 +1525,11 @@ ZSTD_compressBlock_lazy_generic(
const BYTE* ip = istart;
const BYTE* anchor = istart;
const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
const BYTE* const base = ms->window.base;
const U32 prefixLowestIndex = ms->window.dictLimit;
const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
@@ -878,26 +1541,30 @@ ZSTD_compressBlock_lazy_generic(
* that should never occur (extDict modes go to the other implementation
* below and there is no DDSS for binary tree search yet).
*/
- const searchMax_f searchFuncs[4][2] = {
+ const searchMax_f searchFuncs[4][3] = {
{
ZSTD_HcFindBestMatch_selectMLS,
- ZSTD_BtFindBestMatch_selectMLS
+ ZSTD_BtFindBestMatch_selectMLS,
+ ZSTD_RowFindBestMatch_selectRowLog
},
{
NULL,
+ NULL,
NULL
},
{
ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+ ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
+ ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
},
{
ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
- NULL
+ NULL,
+ ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
}
};
- searchMax_f const searchMax = searchFuncs[dictMode][searchMethod == search_binaryTree];
+ searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -915,9 +1582,7 @@ ZSTD_compressBlock_lazy_generic(
assert(searchMax != NULL);
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
-
- /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
ip += (dictAndPrefixLength == 0);
if (dictMode == ZSTD_noDict) {
U32 const curr = (U32)(ip - base);
@@ -933,6 +1598,12 @@ ZSTD_compressBlock_lazy_generic(
assert(offset_2 <= dictAndPrefixLength);
}
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog,
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+ ms->nextToUpdate, ilimit);
+ }
+
/* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
/* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
@@ -1198,6 +1869,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
}
+/* Row-based matchfinder */
+size_t ZSTD_compressBlock_lazy2_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+}
+
+
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+}
+
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+}
FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_lazy_extDict_generic(
@@ -1210,7 +1945,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const BYTE* ip = istart;
const BYTE* anchor = istart;
const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = iend - 8;
+ const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
const BYTE* const base = ms->window.base;
const U32 dictLimit = ms->window.dictLimit;
const BYTE* const prefixStart = base + dictLimit;
@@ -1218,18 +1953,28 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const BYTE* const dictEnd = dictBase + dictLimit;
const BYTE* const dictStart = dictBase + ms->window.lowLimit;
const U32 windowLog = ms->cParams.windowLog;
+ const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
typedef size_t (*searchMax_f)(
ZSTD_matchState_t* ms,
const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
- searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
-
+ const searchMax_f searchFuncs[3] = {
+ ZSTD_HcFindBestMatch_extDict_selectMLS,
+ ZSTD_BtFindBestMatch_extDict_selectMLS,
+ ZSTD_RowFindBestMatch_extDict_selectRowLog
+ };
+ searchMax_f searchMax = searchFuncs[(int)searchMethod];
U32 offset_1 = rep[0], offset_2 = rep[1];
- DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
/* init */
ip += (ip == prefixStart);
+ if (searchMethod == search_rowHash) {
+ ZSTD_row_fillHashCache(ms, base, rowLog,
+ MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
+ ms->nextToUpdate, ilimit);
+ }
/* Match Loop */
#if defined(__GNUC__) && defined(__x86_64__)
@@ -1249,7 +1994,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const U32 repIndex = (U32)(curr+1 - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
+ & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
/* repcode detected we should take it */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1280,7 +2026,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const U32 repIndex = (U32)(curr - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1311,7 +2058,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const U32 repIndex = (U32)(curr - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1357,7 +2105,8 @@ _storeSequence:
const U32 repIndex = repCurrent - offset_2;
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
- if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected we should take it */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -1410,3 +2159,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict(
{
return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
}
+
+size_t ZSTD_compressBlock_greedy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict_row(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+}