summaryrefslogtreecommitdiff
path: root/thirdparty/zstd/compress
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/zstd/compress')
-rw-r--r--thirdparty/zstd/compress/clevels.h134
-rw-r--r--thirdparty/zstd/compress/fse_compress.c90
-rw-r--r--thirdparty/zstd/compress/huf_compress.c641
-rw-r--r--thirdparty/zstd/compress/zstd_compress.c680
-rw-r--r--thirdparty/zstd/compress/zstd_compress_internal.h257
-rw-r--r--thirdparty/zstd/compress/zstd_compress_literals.c7
-rw-r--r--thirdparty/zstd/compress/zstd_compress_literals.h4
-rw-r--r--thirdparty/zstd/compress/zstd_compress_sequences.c29
-rw-r--r--thirdparty/zstd/compress/zstd_compress_superblock.c7
-rw-r--r--thirdparty/zstd/compress/zstd_cwksp.h68
-rw-r--r--thirdparty/zstd/compress/zstd_double_fast.c415
-rw-r--r--thirdparty/zstd/compress/zstd_fast.c439
-rw-r--r--thirdparty/zstd/compress/zstd_lazy.c1044
-rw-r--r--thirdparty/zstd/compress/zstd_ldm.c16
-rw-r--r--thirdparty/zstd/compress/zstd_ldm.h2
-rw-r--r--thirdparty/zstd/compress/zstd_ldm_geartab.h5
-rw-r--r--thirdparty/zstd/compress/zstd_opt.c397
-rw-r--r--thirdparty/zstd/compress/zstdmt_compress.c114
-rw-r--r--thirdparty/zstd/compress/zstdmt_compress.h5
19 files changed, 2711 insertions, 1643 deletions
diff --git a/thirdparty/zstd/compress/clevels.h b/thirdparty/zstd/compress/clevels.h
new file mode 100644
index 0000000000..7ed2e00490
--- /dev/null
+++ b/thirdparty/zstd/compress/clevels.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CLEVELS_H
+#define ZSTD_CLEVELS_H
+
+#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */
+#include "../zstd.h"
+
+/*-===== Pre-defined compression levels =====-*/
+
+#define ZSTD_MAX_CLEVEL 22
+
+#ifdef __GNUC__
+__attribute__((__unused__))
+#endif
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{ /* "default" - for any srcSize > 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */
+ { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */
+ { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */
+ { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */
+ { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */
+ { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */
+ { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */
+ { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */
+ { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */
+ { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */
+ { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */
+ { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */
+ { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */
+ { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */
+ { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */
+ { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */
+ { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */
+ { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */
+ { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */
+ { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */
+ { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */
+ { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */
+},
+{ /* for srcSize <= 256 KB */
+ /* W, C, H, S, L, T, strat */
+ { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */
+ { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/
+ { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/
+ { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/
+ { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */
+ { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/
+ { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/
+ { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */
+ { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 128 KB */
+ /* W, C, H, S, L, T, strat */
+ { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */
+ { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */
+ { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */
+ { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */
+ { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */
+ { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */
+ { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/
+ { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/
+ { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/
+ { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 16 KB */
+ /* W, C, H, S, L, T, strat */
+ { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */
+ { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */
+ { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */
+ { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/
+ { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */
+ { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/
+ { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/
+ { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/
+ { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/
+ { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/
+ { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/
+ { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/
+ { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/
+ { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/
+ { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+};
+
+
+
+#endif /* ZSTD_CLEVELS_H */
diff --git a/thirdparty/zstd/compress/fse_compress.c b/thirdparty/zstd/compress/fse_compress.c
index b4297ec88a..5547b4ac09 100644
--- a/thirdparty/zstd/compress/fse_compress.c
+++ b/thirdparty/zstd/compress/fse_compress.c
@@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
U32 const step = FSE_TABLESTEP(tableSize);
+ U32 const maxSV1 = maxSymbolValue+1;
- U32* cumul = (U32*)workSpace;
- FSE_FUNCTION_TYPE* tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSymbolValue + 2));
+ U16* cumul = (U16*)workSpace; /* size = maxSV1 */
+ FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)(cumul + (maxSV1+1)); /* size = tableSize */
U32 highThreshold = tableSize-1;
- if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte aligned */
+ assert(((size_t)workSpace & 1) == 0); /* Must be 2 bytes-aligned */
if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSize) return ERROR(tableLog_tooLarge);
/* CTable header */
tableU16[-2] = (U16) tableLog;
@@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
/* symbol start positions */
{ U32 u;
cumul[0] = 0;
- for (u=1; u <= maxSymbolValue+1; u++) {
+ for (u=1; u <= maxSV1; u++) {
if (normalizedCounter[u-1]==-1) { /* Low proba symbol */
cumul[u] = cumul[u-1] + 1;
tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
} else {
- cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+ assert(normalizedCounter[u-1] >= 0);
+ cumul[u] = cumul[u-1] + (U16)normalizedCounter[u-1];
+ assert(cumul[u] >= cumul[u-1]); /* no overflow */
} }
- cumul[maxSymbolValue+1] = tableSize+1;
+ cumul[maxSV1] = (U16)(tableSize+1);
}
/* Spread symbols */
- { U32 position = 0;
+ if (highThreshold == tableSize - 1) {
+ /* Case for no low prob count symbols. Lay down 8 bytes at a time
+ * to reduce branch misses since we are operating on a small block
+ */
+ BYTE* const spread = tableSymbol + tableSize; /* size = tableSize + 8 (may write beyond tableSize) */
+ { U64 const add = 0x0101010101010101ull;
+ size_t pos = 0;
+ U64 sv = 0;
+ U32 s;
+ for (s=0; s<maxSV1; ++s, sv += add) {
+ int i;
+ int const n = normalizedCounter[s];
+ MEM_write64(spread + pos, sv);
+ for (i = 8; i < n; i += 8) {
+ MEM_write64(spread + pos + i, sv);
+ }
+ assert(n>=0);
+ pos += (size_t)n;
+ }
+ }
+ /* Spread symbols across the table. Lack of lowprob symbols means that
+ * we don't need variable sized inner loop, so we can unroll the loop and
+ * reduce branch misses.
+ */
+ { size_t position = 0;
+ size_t s;
+ size_t const unroll = 2; /* Experimentally determined optimal unroll */
+ assert(tableSize % unroll == 0); /* FSE_MIN_TABLELOG is 5 */
+ for (s = 0; s < (size_t)tableSize; s += unroll) {
+ size_t u;
+ for (u = 0; u < unroll; ++u) {
+ size_t const uPosition = (position + (u * step)) & tableMask;
+ tableSymbol[uPosition] = spread[s + u];
+ }
+ position = (position + (unroll * step)) & tableMask;
+ }
+ assert(position == 0); /* Must have initialized all positions */
+ }
+ } else {
+ U32 position = 0;
U32 symbol;
- for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+ for (symbol=0; symbol<maxSV1; symbol++) {
int nbOccurrences;
int const freq = normalizedCounter[symbol];
for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
@@ -120,7 +162,6 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
while (position > highThreshold)
position = (position + step) & tableMask; /* Low proba area */
} }
-
assert(position==0); /* Must have initialized all positions */
}
@@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
case -1:
case 1:
symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
- symbolTT[s].deltaFindState = total - 1;
+ assert(total <= INT_MAX);
+ symbolTT[s].deltaFindState = (int)(total - 1);
total ++;
break;
default :
- {
- U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
- U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+ assert(normalizedCounter[s] > 1);
+ { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+ U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
- symbolTT[s].deltaFindState = total - normalizedCounter[s];
- total += normalizedCounter[s];
+ symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
+ total += (unsigned)normalizedCounter[s];
} } } }
#if 0 /* debug : symbol costs */
@@ -164,32 +206,26 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
symbol, normalizedCounter[symbol],
FSE_getMaxNbBits(symbolTT, symbol),
(double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
- }
- }
+ } }
#endif
return 0;
}
-#ifndef ZSTD_NO_UNUSED_FUNCTIONS
-size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
-{
- FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE]; /* memset() is not necessary, even if static analyzer complain about it */
- return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
-}
-#endif
-
#ifndef FSE_COMMONDEFS_ONLY
-
/*-**************************************************************
* FSE NCount encoding
****************************************************************/
size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
{
- size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+ size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
+ + 4 /* bitCount initialized at 4 */
+ + 2 /* first two symbols may use one additional bit each */) / 8)
+ + 1 /* round up to whole nb bytes */
+ + 2 /* additional two bytes for bitstream flush */;
return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
}
diff --git a/thirdparty/zstd/compress/huf_compress.c b/thirdparty/zstd/compress/huf_compress.c
index 485906e678..2b3d6adc2a 100644
--- a/thirdparty/zstd/compress/huf_compress.c
+++ b/thirdparty/zstd/compress/huf_compress.c
@@ -53,6 +53,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
/* *******************************************************
* HUF : Huffman block compression
*********************************************************/
+#define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+{
+ size_t const mask = align - 1;
+ size_t const rem = (size_t)workspace & mask;
+ size_t const add = (align - rem) & mask;
+ BYTE* const aligned = (BYTE*)workspace + add;
+ assert((align & (align - 1)) == 0); /* pow 2 */
+ assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+ if (*workspaceSizePtr >= add) {
+ assert(add < align);
+ assert(((size_t)aligned & mask) == 0);
+ *workspaceSizePtr -= add;
+ return aligned;
+ } else {
+ *workspaceSizePtr = 0;
+ return NULL;
+ }
+}
+
+
/* HUF_compressWeights() :
* Same as FSE_compress(), but dedicated to huff0's weights compression.
* The use case needs much less stack memory.
@@ -75,7 +97,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
unsigned maxSymbolValue = HUF_TABLELOG_MAX;
U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
- HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace;
+ HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);
@@ -106,6 +128,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
return (size_t)(op-ostart);
}
+static size_t HUF_getNbBits(HUF_CElt elt)
+{
+ return elt & 0xFF;
+}
+
+static size_t HUF_getNbBitsFast(HUF_CElt elt)
+{
+ return elt;
+}
+
+static size_t HUF_getValue(HUF_CElt elt)
+{
+ return elt & ~0xFF;
+}
+
+static size_t HUF_getValueFast(HUF_CElt elt)
+{
+ return elt;
+}
+
+static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+{
+ assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+ *elt = nbBits;
+}
+
+static void HUF_setValue(HUF_CElt* elt, size_t value)
+{
+ size_t const nbBits = HUF_getNbBits(*elt);
+ if (nbBits > 0) {
+ assert((value >> nbBits) == 0);
+ *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
+ }
+}
typedef struct {
HUF_CompressWeightsWksp wksp;
@@ -117,9 +173,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
void* workspace, size_t workspaceSize)
{
+ HUF_CElt const* const ct = CTable + 1;
BYTE* op = (BYTE*)dst;
U32 n;
- HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace;
+ HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
/* check conditions */
if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
@@ -130,9 +187,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
for (n=1; n<huffLog+1; n++)
wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
for (n=0; n<maxSymbolValue; n++)
- wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits];
+ wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];
/* attempt weights compression by FSE */
+ if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
{ CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
op[0] = (BYTE)hSize;
@@ -166,6 +224,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
U32 tableLog = 0;
U32 nbSymbols = 0;
+ HUF_CElt* const ct = CTable + 1;
/* get symbol weights */
CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
@@ -175,6 +234,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+ CTable[0] = tableLog;
+
/* Prepare base value per rank */
{ U32 n, nextRankStart = 0;
for (n=1; n<=tableLog; n++) {
@@ -186,13 +247,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
/* fill nbBits */
{ U32 n; for (n=0; n<nbSymbols; n++) {
const U32 w = huffWeight[n];
- CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+ HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
} }
/* fill val */
{ U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
- { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
/* determine stating value per rank */
valPerRank[tableLog+1] = 0; /* for w==0 */
{ U16 min = 0;
@@ -202,18 +263,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
min >>= 1;
} }
/* assign value within rank, symbol order */
- { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
}
*maxSymbolValuePtr = nbSymbols - 1;
return readSize;
}
-U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
{
- const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+ const HUF_CElt* ct = CTable + 1;
assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
- return table[symbolValue].nbBits;
+ return (U32)HUF_getNbBits(ct[symbolValue]);
}
@@ -367,22 +428,118 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
}
typedef struct {
- U32 base;
- U32 curr;
+ U16 base;
+ U16 curr;
} rankPos;
typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
-#define RANK_POSITION_TABLE_SIZE 32
+/* Number of buckets available for HUF_sort() */
+#define RANK_POSITION_TABLE_SIZE 192
typedef struct {
huffNodeTable huffNodeTbl;
rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
} HUF_buildCTable_wksp_tables;
+/* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all "large" counts.
+ *
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+
+/* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+static U32 HUF_getIndex(U32 const count) {
+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+ ? count
+ : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+}
+
+/* Helper swap function for HUF_quickSortPartition() */
+static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+ nodeElt tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+/* Returns 0 if the huffNode array is not sorted by descending count */
+MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
+ U32 i;
+ for (i = 1; i < maxSymbolValue1; ++i) {
+ if (huffNode[i].count > huffNode[i-1].count) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/* Insertion sort by descending order */
+HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
+ int i;
+ int const size = high-low+1;
+ huffNode += low;
+ for (i = 1; i < size; ++i) {
+ nodeElt const key = huffNode[i];
+ int j = i - 1;
+ while (j >= 0 && huffNode[j].count < key.count) {
+ huffNode[j + 1] = huffNode[j];
+ j--;
+ }
+ huffNode[j + 1] = key;
+ }
+}
+
+/* Pivot helper function for quicksort. */
+static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
+ /* Simply select rightmost element as pivot. "Better" selectors like
+ * median-of-three don't experimentally appear to have any benefit.
+ */
+ U32 const pivot = arr[high].count;
+ int i = low - 1;
+ int j = low;
+ for ( ; j < high; j++) {
+ if (arr[j].count > pivot) {
+ i++;
+ HUF_swapNodes(&arr[i], &arr[j]);
+ }
+ }
+ HUF_swapNodes(&arr[i + 1], &arr[high]);
+ return i + 1;
+}
+
+/* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+ int const kInsertionSortThreshold = 8;
+ if (high - low < kInsertionSortThreshold) {
+ HUF_insertionSort(arr, low, high);
+ return;
+ }
+ while (low < high) {
+ int const idx = HUF_quickSortPartition(arr, low, high);
+ if (idx - low < high - idx) {
+ HUF_simpleQuickSort(arr, low, idx - 1);
+ low = idx + 1;
+ } else {
+ HUF_simpleQuickSort(arr, idx + 1, high);
+ high = idx - 1;
+ }
+ }
+}
+
/**
* HUF_sort():
* Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
*
* @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
* Must have (maxSymbolValue + 1) entries.
@@ -390,44 +547,52 @@ typedef struct {
* @param[in] maxSymbolValue Maximum symbol value.
* @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
*/
-static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
-{
- int n;
- int const maxSymbolValue1 = (int)maxSymbolValue + 1;
+static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
+ U32 n;
+ U32 const maxSymbolValue1 = maxSymbolValue+1;
/* Compute base and set curr to base.
- * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
- * Then 2^lowerRank <= count[n]+1 <= 2^rank.
+ * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+ * See HUF_getIndex to see bucketing strategy.
* We attribute each symbol to lowerRank's base value, because we want to know where
* each rank begins in the output, so for rank R we want to count ranks R+1 and above.
*/
ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
for (n = 0; n < maxSymbolValue1; ++n) {
- U32 lowerRank = BIT_highbit32(count[n] + 1);
+ U32 lowerRank = HUF_getIndex(count[n]);
+ assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
rankPosition[lowerRank].base++;
}
+
assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+ /* Set up the rankPosition table */
for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
rankPosition[n-1].base += rankPosition[n].base;
rankPosition[n-1].curr = rankPosition[n-1].base;
}
- /* Sort */
+
+ /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
for (n = 0; n < maxSymbolValue1; ++n) {
U32 const c = count[n];
- U32 const r = BIT_highbit32(c+1) + 1;
- U32 pos = rankPosition[r].curr++;
- /* Insert into the correct position in the rank.
- * We have at most 256 symbols, so this insertion should be fine.
- */
- while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
- huffNode[pos] = huffNode[pos-1];
- pos--;
- }
+ U32 const r = HUF_getIndex(c) + 1;
+ U32 const pos = rankPosition[r].curr++;
+ assert(pos < maxSymbolValue1);
huffNode[pos].count = c;
huffNode[pos].byte = (BYTE)n;
}
-}
+ /* Sort each bucket. */
+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+ U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+ U32 const bucketStartIdx = rankPosition[n].base;
+ if (bucketSize > 1) {
+ assert(bucketStartIdx < maxSymbolValue1);
+ HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
+ }
+ }
+
+ assert(HUF_isSorted(huffNode, maxSymbolValue1));
+}
/** HUF_buildCTable_wksp() :
* Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -490,6 +655,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
*/
static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
{
+ HUF_CElt* const ct = CTable + 1;
/* fill result into ctable (val, nbBits) */
int n;
U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
@@ -505,20 +671,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
min >>= 1;
} }
for (n=0; n<alphabetSize; n++)
- CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
for (n=0; n<alphabetSize; n++)
- CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */
+ HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
+ CTable[0] = maxNbBits;
}
-size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
{
- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+ HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
nodeElt* const huffNode = huffNode0+1;
int nonNullRank;
/* safety checks */
- if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
return ERROR(workSpace_tooSmall);
if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
@@ -536,96 +702,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo
maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
- HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+ HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
return maxNbBits;
}
size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
{
+ HUF_CElt const* ct = CTable + 1;
size_t nbBits = 0;
int s;
for (s = 0; s <= (int)maxSymbolValue; ++s) {
- nbBits += CTable[s].nbBits * count[s];
+ nbBits += HUF_getNbBits(ct[s]) * count[s];
}
return nbBits >> 3;
}
int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+ HUF_CElt const* ct = CTable + 1;
int bad = 0;
int s;
for (s = 0; s <= (int)maxSymbolValue; ++s) {
- bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
}
return !bad;
}
size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+/** HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ * 1. HUF_addBits() takes a HUF_CElt (size_t) which is
+ * the pair (nbBits, value) in the format:
+ * format:
+ * - Bits [0, 4) = nbBits
+ * - Bits [4, 64 - nbBits) = 0
+ * - Bits [64 - nbBits, 64) = value
+ * 2. The bitContainer is built from the upper bits and
+ * right shifted. E.g. to add a new value of N bits
+ * you right shift the bitContainer by N, then or in
+ * the new value into the N upper bits.
+ * 3. The bitstream has two bit containers. You can add
+ * bits to the second container and merge them into
+ * the first container.
+ */
+
+#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
+
+typedef struct {
+ size_t bitContainer[2];
+ size_t bitPos[2];
+
+ BYTE* startPtr;
+ BYTE* ptr;
+ BYTE* endPtr;
+} HUF_CStream_t;
+
+/**! HUF_initCStream():
+ * Initializes the bitstream.
+ * @returns 0 or an error code.
+ */
+static size_t HUF_initCStream(HUF_CStream_t* bitC,
+ void* startPtr, size_t dstCapacity)
+{
+ ZSTD_memset(bitC, 0, sizeof(*bitC));
+ bitC->startPtr = (BYTE*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
+ if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+/*! HUF_addBits():
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
+ *
+ * @param elt The element we're adding. This is a (nbBits, value) pair.
+ * See the HUF_CStream_t docs for the format.
+ * @param idx Insert into the bitstream at this idx.
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
+ * to have at least 4 unused bits after this call it may be 1,
+ * otherwise it must be 0. HUF_addBits() is faster when fast is set.
+ */
+FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
+{
+ assert(idx <= 1);
+ assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
+ /* This is efficient on x86-64 with BMI2 because shrx
+ * only reads the low 6 bits of the register. The compiler
+ * knows this and elides the mask. When fast is set,
+ * every operation can use the same value loaded from elt.
+ */
+ bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
+ bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
+ /* We only read the low 8 bits of bitC->bitPos[idx] so it
+ * doesn't matter that the high bits have noise from the value.
+ */
+ bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
+ assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ /* The last 4-bits of elt are dirty if fast is set,
+ * so we must not be overwriting bits that have already been
+ * inserted into the bit container.
+ */
+#if DEBUGLEVEL >= 1
+ {
+ size_t const nbBits = HUF_getNbBits(elt);
+ size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+ (void)dirtyBits;
+ /* Middle bits are 0. */
+ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+ /* We didn't overwrite any bits in the bit container. */
+ assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ (void)dirtyBits;
+ }
+#endif
+}
+
+FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
+{
+ bitC->bitContainer[1] = 0;
+ bitC->bitPos[1] = 0;
+}
+
+/*! HUF_mergeIndex1() :
+ * Merges the bit container @ index 1 into the bit container @ index 0
+ * and zeros the bit container @ index 1.
+ */
+FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
+{
+ assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
+ bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
+ bitC->bitContainer[0] |= bitC->bitContainer[1];
+ bitC->bitPos[0] += bitC->bitPos[1];
+ assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+}
+
+/*! HUF_flushBits() :
+* Flushes the bits in the bit container @ index 0.
+*
+* @post bitPos will be < 8.
+* @param kFast If kFast is set then we must know a-priori that
+* the bit container will not overflow.
+*/
+FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
+{
+ /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ size_t const nbBytes = nbBits >> 3;
+ /* The top nbBits bits of bitContainer are the ones we need. */
+ size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
+ /* Mask bitPos to account for the bytes we consumed. */
+ bitC->bitPos[0] &= 7;
+ assert(nbBits > 0);
+ assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitContainer);
+ bitC->ptr += nbBytes;
+ assert(!kFast || bitC->ptr <= bitC->endPtr);
+ if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ /* bitContainer doesn't need to be modified because the leftover
+ * bits are already the top bitPos bits. And we don't care about
+ * noise in the lower values.
+ */
+}
+
+/*! HUF_endMark()
+ * @returns The Huffman stream end mark: A 1-bit value = 1.
+ */
+static HUF_CElt HUF_endMark(void)
+{
+ HUF_CElt endMark;
+ HUF_setNbBits(&endMark, 1);
+ HUF_setValue(&endMark, 1);
+ return endMark;
+}
+
+/*! HUF_closeCStream() :
+ * @return Size of CStream, in bytes,
+ * or 0 if it could not fit into dstBuffer */
+static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+{
+ HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
+ HUF_flushBits(bitC, /* kFast */ 0);
+ {
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+ }
+}
+
FORCE_INLINE_TEMPLATE void
-HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
{
- BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+ HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
}
-#define HUF_FLUSHBITS(s) BIT_flushBits(s)
+FORCE_INLINE_TEMPLATE void
+HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
+ const BYTE* ip, size_t srcSize,
+ const HUF_CElt* ct,
+ int kUnroll, int kFastFlush, int kLastFast)
+{
+ /* Join to kUnroll */
+ int n = (int)srcSize;
+ int rem = n % kUnroll;
+ if (rem > 0) {
+ for (; rem > 0; --rem) {
+ HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
+ }
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n % kUnroll == 0);
+
+ /* Join to 2 * kUnroll */
+ if (n % (2 * kUnroll)) {
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ n -= kUnroll;
+ }
+ assert(n % (2 * kUnroll) == 0);
+
+ for (; n>0; n-= 2 * kUnroll) {
+ /* Encode kUnroll symbols into the bitstream @ index 0. */
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ /* Encode kUnroll symbols into the bitstream @ index 1.
+ * This allows us to start filling the bit container
+ * without any data dependencies.
+ */
+ HUF_zeroIndex1(bitC);
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
+ /* Merge bitstream @ index 1 into the bitstream @ index 0 */
+ HUF_mergeIndex1(bitC);
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n == 0);
+
+}
-#define HUF_FLUSHBITS_1(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+/**
+ * Returns a tight upper bound on the output space needed by Huffman
+ * with 8 bytes buffer to handle over-writes. If the output is at least
+ * this large we don't need to do bounds checks during Huffman encoding.
+ */
+static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
+{
+ return ((srcSize * tableLog) >> 3) + 8;
+}
-#define HUF_FLUSHBITS_2(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
FORCE_INLINE_TEMPLATE size_t
HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
const void* src, size_t srcSize,
const HUF_CElt* CTable)
{
+ U32 const tableLog = (U32)CTable[0];
+ HUF_CElt const* ct = CTable + 1;
const BYTE* ip = (const BYTE*) src;
BYTE* const ostart = (BYTE*)dst;
BYTE* const oend = ostart + dstSize;
BYTE* op = ostart;
- size_t n;
- BIT_CStream_t bitC;
+ HUF_CStream_t bitC;
/* init */
if (dstSize < 8) return 0; /* not enough space to compress */
- { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+ { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
if (HUF_isError(initErr)) return 0; }
- n = srcSize & ~3; /* join to mod 4 */
- switch (srcSize & 3)
- {
- case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- /* fall-through */
- case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- /* fall-through */
- case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
- HUF_FLUSHBITS(&bitC);
- /* fall-through */
- case 0 : /* fall-through */
- default: break;
- }
-
- for (; n>0; n-=4) { /* note : n&3==0 at this stage */
- HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
- HUF_FLUSHBITS(&bitC);
+ if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
+ else {
+ if (MEM_32bits()) {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10: ZSTD_FALLTHROUGH;
+ case 9: ZSTD_FALLTHROUGH;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 7: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ } else {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 9:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 7:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 6: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ }
}
+ assert(bitC.ptr <= bitC.endPtr);
- return BIT_closeCStream(&bitC);
+ return HUF_closeCStream(&bitC);
}
#if DYNAMIC_BMI2
-static TARGET_ATTRIBUTE("bmi2") size_t
+static BMI2_TARGET_ATTRIBUTE size_t
HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
const void* src, size_t srcSize,
const HUF_CElt* CTable)
@@ -667,9 +1071,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
{
- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
}
+size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+{
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+}
static size_t
HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
@@ -689,8 +1097,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
assert(op <= oend);
{ CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
MEM_writeLE16(ostart, (U16)cSize);
op += cSize;
}
@@ -698,8 +1105,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
ip += segmentSize;
assert(op <= oend);
{ CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
MEM_writeLE16(ostart+2, (U16)cSize);
op += cSize;
}
@@ -707,8 +1113,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
ip += segmentSize;
assert(op <= oend);
{ CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
MEM_writeLE16(ostart+4, (U16)cSize);
op += cSize;
}
@@ -717,7 +1122,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
assert(op <= oend);
assert(ip <= iend);
{ CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
- if (cSize==0) return 0;
+ if (cSize == 0 || cSize > 65535) return 0;
op += cSize;
}
@@ -726,7 +1131,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
{
- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+{
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
}
typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -750,35 +1160,38 @@ static size_t HUF_compressCTable_internal(
typedef struct {
unsigned count[HUF_SYMBOLVALUE_MAX + 1];
- HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
union {
HUF_buildCTable_wksp_tables buildCTable_wksp;
HUF_WriteCTableWksp writeCTable_wksp;
+ U32 hist_wksp[HIST_WKSP_SIZE_U32];
} wksps;
} HUF_compress_tables_t;
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
+#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
+
/* HUF_compress_internal() :
* `workSpace_align4` must be aligned on 4-bytes boundaries,
- * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
static size_t
HUF_compress_internal (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
HUF_nbStreams_e nbStreams,
- void* workSpace_align4, size_t wkspSize,
+ void* workSpace, size_t wkspSize,
HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
- const int bmi2)
+ const int bmi2, unsigned suspectUncompressible)
{
- HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
BYTE* const ostart = (BYTE*)dst;
BYTE* const oend = ostart + dstSize;
BYTE* op = ostart;
- HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
- assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */
+ HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
/* checks & inits */
- if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+ if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
if (!srcSize) return 0; /* Uncompressed */
if (!dstSize) return 0; /* cannot fit anything within dst budget */
if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
@@ -794,8 +1207,23 @@ HUF_compress_internal (void* dst, size_t dstSize,
nbStreams, oldHufTable, bmi2);
}
+ /* If uncompressible data is suspected, do a smaller sampling first */
+ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+ if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+ size_t largestTotal = 0;
+ { unsigned maxSymbolValueBegin = maxSymbolValue;
+ CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestBegin;
+ }
+ { unsigned maxSymbolValueEnd = maxSymbolValue;
+ CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestEnd;
+ }
+ if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
+
/* Scan input and build symbol stats */
- { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
}
@@ -820,9 +1248,12 @@ HUF_compress_internal (void* dst, size_t dstSize,
&table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
CHECK_F(maxBits);
huffLog = (U32)maxBits;
- /* Zero unused symbols in CTable, so we can check it for validity */
- ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
- sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+ }
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ {
+ size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
+ size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
+ ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
}
/* Write table description header */
@@ -859,19 +1290,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
}
size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+ int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_singleStream,
workSpace, wkspSize, hufTable,
- repeat, preferRepeat, bmi2);
+ repeat, preferRepeat, bmi2, suspectUncompressible);
}
/* HUF_compress4X_repeat():
@@ -885,22 +1317,23 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
}
/* HUF_compress4X_repeat():
* compress input using 4 streams.
+ * consider skipping quickly
* re-use an existing huffman compression table */
size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog,
void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
{
return HUF_compress_internal(dst, dstSize, src, srcSize,
maxSymbolValue, huffLog, HUF_fourStreams,
workSpace, wkspSize,
- hufTable, repeat, preferRepeat, bmi2);
+ hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
}
#ifndef ZSTD_NO_UNUSED_FUNCTIONS
@@ -918,7 +1351,7 @@ size_t HUF_compress1X (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog)
{
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
}
@@ -926,7 +1359,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
const void* src, size_t srcSize,
unsigned maxSymbolValue, unsigned huffLog)
{
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
}
diff --git a/thirdparty/zstd/compress/zstd_compress.c b/thirdparty/zstd/compress/zstd_compress.c
index b7ee2980a7..f06456af92 100644
--- a/thirdparty/zstd/compress/zstd_compress.c
+++ b/thirdparty/zstd/compress/zstd_compress.c
@@ -12,7 +12,6 @@
* Dependencies
***************************************/
#include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
-#include "../common/cpu.h"
#include "../common/mem.h"
#include "hist.h" /* HIST_countFast_wksp */
#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
@@ -42,6 +41,18 @@
# define ZSTD_COMPRESS_HEAPMODE 0
#endif
+/*!
+ * ZSTD_HASHLOG3_MAX :
+ * Maximum size of the hash table dedicated to find 3-bytes matches,
+ * in log format, aka 17 => 1 << 17 == 128Ki positions.
+ * This structure is only used in zstd_opt.
+ * Since allocation is centralized for all strategies, it has to be known here.
+ * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3,
+ * so that zstd_opt.c doesn't need to know about this constant.
+ */
+#ifndef ZSTD_HASHLOG3_MAX
+# define ZSTD_HASHLOG3_MAX 17
+#endif
/*-*************************************
* Helper functions
@@ -72,10 +83,10 @@ struct ZSTD_CDict_s {
ZSTD_customMem customMem;
U32 dictID;
int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
- ZSTD_useRowMatchFinderMode_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
- * row-based matchfinder. Unless the cdict is reloaded, we will use
- * the same greedy/lazy matchfinder at compression time.
- */
+ ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use
+ * row-based matchfinder. Unless the cdict is reloaded, we will use
+ * the same greedy/lazy matchfinder at compression time.
+ */
}; /* typedef'd to ZSTD_CDict within "zstd.h" */
ZSTD_CCtx* ZSTD_createCCtx(void)
@@ -88,7 +99,7 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
assert(cctx != NULL);
ZSTD_memset(cctx, 0, sizeof(*cctx));
cctx->customMem = memManager;
- cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ cctx->bmi2 = ZSTD_cpuSupportsBmi2();
{ size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
assert(!ZSTD_isError(err));
(void)err;
@@ -214,35 +225,42 @@ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) {
/* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder
* for this compression.
*/
-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_useRowMatchFinderMode_e mode) {
- assert(mode != ZSTD_urm_auto);
- return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_urm_enableRowMatchFinder);
+static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) {
+ assert(mode != ZSTD_ps_auto);
+ return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable);
}
-/* Returns row matchfinder usage enum given an initial mode and cParams */
-static ZSTD_useRowMatchFinderMode_e ZSTD_resolveRowMatchFinderMode(ZSTD_useRowMatchFinderMode_e mode,
- const ZSTD_compressionParameters* const cParams) {
-#if !defined(ZSTD_NO_INTRINSICS) && (defined(__SSE2__) || defined(__ARM_NEON))
+/* Returns row matchfinder usage given an initial mode and cParams */
+static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode,
+ const ZSTD_compressionParameters* const cParams) {
+#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON)
int const kHasSIMD128 = 1;
#else
int const kHasSIMD128 = 0;
#endif
- if (mode != ZSTD_urm_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */
- mode = ZSTD_urm_disableRowMatchFinder;
+ if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */
+ mode = ZSTD_ps_disable;
if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode;
if (kHasSIMD128) {
- if (cParams->windowLog > 14) mode = ZSTD_urm_enableRowMatchFinder;
+ if (cParams->windowLog > 14) mode = ZSTD_ps_enable;
} else {
- if (cParams->windowLog > 17) mode = ZSTD_urm_enableRowMatchFinder;
+ if (cParams->windowLog > 17) mode = ZSTD_ps_enable;
}
return mode;
}
+/* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */
+static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode,
+ const ZSTD_compressionParameters* const cParams) {
+ if (mode != ZSTD_ps_auto) return mode;
+ return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable;
+}
+
/* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */
static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
- const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ const ZSTD_paramSwitch_e useRowMatchFinder,
const U32 forDDSDict) {
- assert(useRowMatchFinder != ZSTD_urm_auto);
+ assert(useRowMatchFinder != ZSTD_ps_auto);
/* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate.
* We do not allocate a chaintable if we are using ZSTD_fast, or are using the row-based matchfinder.
*/
@@ -253,16 +271,10 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
* enable long distance matching (wlog >= 27, strategy >= btopt).
* Returns 0 otherwise.
*/
-static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* const cParams) {
- return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27;
-}
-
-/* Returns 1 if compression parameters are such that we should
- * enable blockSplitter (wlog >= 17, strategy >= btopt).
- * Returns 0 otherwise.
- */
-static U32 ZSTD_CParams_useBlockSplitter(const ZSTD_compressionParameters* const cParams) {
- return cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17;
+static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+ const ZSTD_compressionParameters* const cParams) {
+ if (mode != ZSTD_ps_auto) return mode;
+ return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
}
static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
@@ -274,20 +286,13 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
cctxParams.cParams = cParams;
/* Adjust advanced params according to cParams */
- if (ZSTD_CParams_shouldEnableLdm(&cParams)) {
- DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into cctx params");
- cctxParams.ldmParams.enableLdm = 1;
- /* LDM is enabled by default for optimal parser and window size >= 128MB */
+ cctxParams.ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams.ldmParams.enableLdm, &cParams);
+ if (cctxParams.ldmParams.enableLdm == ZSTD_ps_enable) {
ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams);
assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog);
assert(cctxParams.ldmParams.hashRateLog < 32);
}
-
- if (ZSTD_CParams_useBlockSplitter(&cParams)) {
- DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including block splitting into cctx params");
- cctxParams.splitBlocks = 1;
- }
-
+ cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
assert(!ZSTD_checkCParams(cParams));
return cctxParams;
@@ -348,7 +353,10 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
*/
cctxParams->compressionLevel = compressionLevel;
cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
- DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d", cctxParams->useRowMatchFinder);
+ cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
+ cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+ DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+ cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
}
size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
@@ -518,9 +526,9 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
return bounds;
case ZSTD_c_literalCompressionMode:
- ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed);
- bounds.lowerBound = ZSTD_lcm_auto;
- bounds.upperBound = ZSTD_lcm_uncompressed;
+ ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable < ZSTD_ps_disable);
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;
case ZSTD_c_targetCBlockSize:
@@ -549,14 +557,14 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
bounds.upperBound = 1;
return bounds;
- case ZSTD_c_splitBlocks:
- bounds.lowerBound = 0;
- bounds.upperBound = 1;
+ case ZSTD_c_useBlockSplitter:
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;
case ZSTD_c_useRowMatchFinder:
- bounds.lowerBound = (int)ZSTD_urm_auto;
- bounds.upperBound = (int)ZSTD_urm_enableRowMatchFinder;
+ bounds.lowerBound = (int)ZSTD_ps_auto;
+ bounds.upperBound = (int)ZSTD_ps_disable;
return bounds;
case ZSTD_c_deterministicRefPrefix:
@@ -625,7 +633,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
case ZSTD_c_stableOutBuffer:
case ZSTD_c_blockDelimiters:
case ZSTD_c_validateSequences:
- case ZSTD_c_splitBlocks:
+ case ZSTD_c_useBlockSplitter:
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
default:
@@ -680,7 +688,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
case ZSTD_c_stableOutBuffer:
case ZSTD_c_blockDelimiters:
case ZSTD_c_validateSequences:
- case ZSTD_c_splitBlocks:
+ case ZSTD_c_useBlockSplitter:
case ZSTD_c_useRowMatchFinder:
case ZSTD_c_deterministicRefPrefix:
break;
@@ -780,7 +788,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
}
case ZSTD_c_literalCompressionMode : {
- const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value;
+ const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
CCtxParams->literalCompressionMode = lcm;
return CCtxParams->literalCompressionMode;
@@ -835,7 +843,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
return CCtxParams->enableDedicatedDictSearch;
case ZSTD_c_enableLongDistanceMatching :
- CCtxParams->ldmParams.enableLdm = (value!=0);
+ CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
return CCtxParams->ldmParams.enableLdm;
case ZSTD_c_ldmHashLog :
@@ -857,8 +865,8 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
return CCtxParams->ldmParams.bucketSizeLog;
case ZSTD_c_ldmHashRateLog :
- RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN,
- parameter_outOfBound, "Param out of bounds!");
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
CCtxParams->ldmParams.hashRateLog = value;
return CCtxParams->ldmParams.hashRateLog;
@@ -894,14 +902,14 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
CCtxParams->validateSequences = value;
return CCtxParams->validateSequences;
- case ZSTD_c_splitBlocks:
- BOUNDCHECK(ZSTD_c_splitBlocks, value);
- CCtxParams->splitBlocks = value;
- return CCtxParams->splitBlocks;
+ case ZSTD_c_useBlockSplitter:
+ BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
+ CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value;
+ return CCtxParams->useBlockSplitter;
case ZSTD_c_useRowMatchFinder:
BOUNDCHECK(ZSTD_c_useRowMatchFinder, value);
- CCtxParams->useRowMatchFinder = (ZSTD_useRowMatchFinderMode_e)value;
+ CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value;
return CCtxParams->useRowMatchFinder;
case ZSTD_c_deterministicRefPrefix:
@@ -1032,8 +1040,8 @@ size_t ZSTD_CCtxParams_getParameter(
case ZSTD_c_validateSequences :
*value = (int)CCtxParams->validateSequences;
break;
- case ZSTD_c_splitBlocks :
- *value = (int)CCtxParams->splitBlocks;
+ case ZSTD_c_useBlockSplitter :
+ *value = (int)CCtxParams->useBlockSplitter;
break;
case ZSTD_c_useRowMatchFinder :
*value = (int)CCtxParams->useRowMatchFinder;
@@ -1067,7 +1075,7 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
return 0;
}
-ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
{
DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
@@ -1147,14 +1155,14 @@ size_t ZSTD_CCtx_loadDictionary_advanced(
return 0;
}
-ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+size_t ZSTD_CCtx_loadDictionary_byReference(
ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
{
return ZSTD_CCtx_loadDictionary_advanced(
cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
}
-ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
{
return ZSTD_CCtx_loadDictionary_advanced(
cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
@@ -1324,7 +1332,7 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
break;
case ZSTD_cpm_createCDict:
/* Assume a small source size when creating a dictionary
- * with an unkown source size.
+ * with an unknown source size.
*/
if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
srcSize = minSrcSize;
@@ -1398,7 +1406,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
srcSizeHint = CCtxParams->srcSizeHint;
}
cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode);
- if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+ if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
assert(!ZSTD_checkCParams(cParams));
/* srcSizeHint == 0 means 0 */
@@ -1407,7 +1415,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
static size_t
ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
- const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ const ZSTD_paramSwitch_e useRowMatchFinder,
const U32 enableDedicatedDictSearch,
const U32 forCCtx)
{
@@ -1440,7 +1448,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
/* tables are guaranteed to be sized in multiples of 64 bytes (or 16 uint32_t) */
ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >= 4 && ZSTD_WINDOWLOG_MIN >= 4 && ZSTD_CHAINLOG_MIN >= 4);
- assert(useRowMatchFinder != ZSTD_urm_auto);
+ assert(useRowMatchFinder != ZSTD_ps_auto);
DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
(U32)chainSize, (U32)hSize, (U32)h3Size);
@@ -1451,12 +1459,12 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
const ZSTD_compressionParameters* cParams,
const ldmParams_t* ldmParams,
const int isStatic,
- const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ const ZSTD_paramSwitch_e useRowMatchFinder,
const size_t buffInSize,
const size_t buffOutSize,
const U64 pledgedSrcSize)
{
- size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << cParams->windowLog), pledgedSrcSize));
+ size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
U32 const divider = (cParams->minMatch==3) ? 3 : 4;
size_t const maxNbSeq = blockSize / divider;
@@ -1469,7 +1477,7 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
- size_t const ldmSeqSpace = ldmParams->enableLdm ?
+ size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
@@ -1496,8 +1504,8 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
{
ZSTD_compressionParameters const cParams =
ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
- ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
- &cParams);
+ ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+ &cParams);
RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
/* estimateCCtxSize is for one-shot compression. So no buffers should
@@ -1514,9 +1522,9 @@ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
/* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
size_t noRowCCtxSize;
size_t rowCCtxSize;
- initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder;
+ initialParams.useRowMatchFinder = ZSTD_ps_disable;
noRowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
- initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder;
+ initialParams.useRowMatchFinder = ZSTD_ps_enable;
rowCCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams);
return MAX(noRowCCtxSize, rowCCtxSize);
} else {
@@ -1561,7 +1569,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
? ZSTD_compressBound(blockSize) + 1
: 0;
- ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
+ ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
&cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
@@ -1576,9 +1584,9 @@ size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
/* Pick bigger of not using and using row-based matchfinder for greedy and lazy strategies */
size_t noRowCCtxSize;
size_t rowCCtxSize;
- initialParams.useRowMatchFinder = ZSTD_urm_disableRowMatchFinder;
+ initialParams.useRowMatchFinder = ZSTD_ps_disable;
noRowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
- initialParams.useRowMatchFinder = ZSTD_urm_enableRowMatchFinder;
+ initialParams.useRowMatchFinder = ZSTD_ps_enable;
rowCCtxSize = ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams);
return MAX(noRowCCtxSize, rowCCtxSize);
} else {
@@ -1713,7 +1721,7 @@ static size_t
ZSTD_reset_matchState(ZSTD_matchState_t* ms,
ZSTD_cwksp* ws,
const ZSTD_compressionParameters* cParams,
- const ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ const ZSTD_paramSwitch_e useRowMatchFinder,
const ZSTD_compResetPolicy_e crp,
const ZSTD_indexResetPolicy_e forceResetIndex,
const ZSTD_resetTarget_e forWho)
@@ -1728,7 +1736,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
- assert(useRowMatchFinder != ZSTD_urm_auto);
+ assert(useRowMatchFinder != ZSTD_ps_auto);
if (forceResetIndex == ZSTDirp_reset) {
ZSTD_window_init(&ms->window);
ZSTD_cwksp_mark_tables_dirty(ws);
@@ -1774,8 +1782,8 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
}
{ /* Switch to 32-entry rows if searchLog is 5 (or more) */
- U32 const rowLog = cParams->searchLog < 5 ? 4 : 5;
- assert(cParams->hashLog > rowLog);
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+ assert(cParams->hashLog >= rowLog);
ms->rowHashLog = cParams->hashLog - rowLog;
}
}
@@ -1824,8 +1832,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
ZSTD_buffered_policy_e const zbuff)
{
ZSTD_cwksp* const ws = &zc->workspace;
- DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d",
- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder);
+ DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
zc->isFirstBlock = 1;
@@ -1836,8 +1844,10 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->appliedParams = *params;
params = &zc->appliedParams;
- assert(params->useRowMatchFinder != ZSTD_urm_auto);
- if (params->ldmParams.enableLdm) {
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+ assert(params->useBlockSplitter != ZSTD_ps_auto);
+ assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* Adjust long distance matching parameters */
ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
assert(params->ldmParams.hashLog >= params->ldmParams.bucketSizeLog);
@@ -1900,7 +1910,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
- RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+ RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
} }
ZSTD_cwksp_clear(ws);
@@ -1937,7 +1947,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
/* ldm bucketOffsets table */
- if (params->ldmParams.enableLdm) {
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* TODO: avoid memset? */
size_t const numBuckets =
((size_t)1) << (params->ldmParams.hashLog -
@@ -1964,7 +1974,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
ZSTD_resetTarget_CCtx), "");
/* ldm hash table */
- if (params->ldmParams.enableLdm) {
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* TODO: avoid memset? */
size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
@@ -1976,8 +1986,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
zc->ldmState.loadedDictEnd = 0;
}
- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
zc->initialized = 1;
@@ -2115,7 +2125,7 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
}
ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
- assert(params.useRowMatchFinder != ZSTD_urm_auto);
+ assert(params.useRowMatchFinder != ZSTD_ps_auto);
/* copy tables */
{ size_t const chainSize = ZSTD_allocateChainTable(cdict_cParams->strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */)
@@ -2209,8 +2219,12 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
{ ZSTD_CCtx_params params = dstCCtx->requestedParams;
/* Copy only compression parameters related to tables. */
params.cParams = srcCCtx->appliedParams.cParams;
- assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_urm_auto);
+ assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto);
+ assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto);
+ assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto);
params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder;
+ params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
+ params.ldmParams = srcCCtx->appliedParams.ldmParams;
params.fParams = fParams;
ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
/* loadedDictSize */ 0,
@@ -2296,6 +2310,8 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
int const nbRows = (int)size / ZSTD_ROWSIZE;
int cellNb = 0;
int rowNb;
+ /* Protect special index values < ZSTD_WINDOW_START_INDEX. */
+ U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX;
assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */
assert(size < (1U<<31)); /* can be casted to int */
@@ -2315,12 +2331,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa
for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
int column;
for (column=0; column<ZSTD_ROWSIZE; column++) {
- if (preserveMark) {
- U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
- table[cellNb] += adder;
+ U32 newVal;
+ if (preserveMark && table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) {
+ /* This write is pointless, but is required(?) for the compiler
+ * to auto-vectorize the loop. */
+ newVal = ZSTD_DUBT_UNSORTED_MARK;
+ } else if (table[cellNb] < reducerThreshold) {
+ newVal = 0;
+ } else {
+ newVal = table[cellNb] - reducerValue;
}
- if (table[cellNb] < reducerValue) table[cellNb] = 0;
- else table[cellNb] -= reducerValue;
+ table[cellNb] = newVal;
cellNb++;
} }
}
@@ -2375,9 +2396,9 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
assert(nbSeq <= seqStorePtr->maxNbSeq);
for (u=0; u<nbSeq; u++) {
U32 const llv = sequences[u].litLength;
- U32 const mlv = sequences[u].matchLength;
+ U32 const mlv = sequences[u].mlBase;
llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
- ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+ ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
}
if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
@@ -2399,11 +2420,13 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
/* ZSTD_blockSplitterEnabled():
* Returns if block splitting param is being used
* If used, compression will do best effort to split a block in order to improve compression ratio.
+ * At the time this function is called, the parameter must be finalized.
* Returns 1 if true, 0 otherwise. */
static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams)
{
- DEBUGLOG(5, "ZSTD_blockSplitterEnabled(splitBlocks=%d)", cctxParams->splitBlocks);
- return (cctxParams->splitBlocks != 0);
+ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter);
+ assert(cctxParams->useBlockSplitter != ZSTD_ps_auto);
+ return (cctxParams->useBlockSplitter == ZSTD_ps_enable);
}
/* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types
@@ -2546,6 +2569,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
* compresses both literals and sequences
* Returns compressed size of block, or a zstd error.
*/
+#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
MEM_STATIC size_t
ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
const ZSTD_entropyCTables_t* prevEntropy,
@@ -2580,15 +2604,19 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
/* Compress literals */
{ const BYTE* const literals = seqStorePtr->litStart;
+ size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+ /* Base suspicion of uncompressibility on ratio of literals to sequences */
+ unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
size_t const litSize = (size_t)(seqStorePtr->lit - literals);
size_t const cSize = ZSTD_compressLiterals(
&prevEntropy->huf, &nextEntropy->huf,
cctxParams->cParams.strategy,
- ZSTD_disableLiteralsCompression(cctxParams),
+ ZSTD_literalsCompressionIsDisabled(cctxParams),
op, dstCapacity,
literals, litSize,
entropyWorkspace, entropyWkspSize,
- bmi2);
+ bmi2, suspectUncompressible);
FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
assert(cSize <= dstCapacity);
op += cSize;
@@ -2693,7 +2721,7 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
/* ZSTD_selectBlockCompressor() :
* Not static, but internal use only (used by long distance matcher)
* assumption : strat is a valid strategy */
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e useRowMatchFinder, ZSTD_dictMode_e dictMode)
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode)
{
static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {
{ ZSTD_compressBlock_fast /* default for 0 */,
@@ -2758,7 +2786,7 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRow
ZSTD_compressBlock_lazy2_dedicatedDictSearch_row }
};
DEBUGLOG(4, "Selecting a row-based matchfinder");
- assert(useRowMatchFinder != ZSTD_urm_auto);
+ assert(useRowMatchFinder != ZSTD_ps_auto);
selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy];
} else {
selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
@@ -2825,7 +2853,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
}
if (zc->externSeqStore.pos < zc->externSeqStore.size) {
- assert(!zc->appliedParams.ldmParams.enableLdm);
+ assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
/* Updates ldmSeqStore.pos */
lastLLSize =
ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2834,7 +2862,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
zc->appliedParams.useRowMatchFinder,
src, srcSize);
assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
- } else if (zc->appliedParams.ldmParams.enableLdm) {
+ } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
ldmSeqStore.seq = zc->ldmSequences;
@@ -2882,9 +2910,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
for (i = 0; i < seqStoreSeqSize; ++i) {
- U32 rawOffset = seqStoreSeqs[i].offset - ZSTD_REP_NUM;
+ U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
outSeqs[i].litLength = seqStoreSeqs[i].litLength;
- outSeqs[i].matchLength = seqStoreSeqs[i].matchLength + MINMATCH;
+ outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
outSeqs[i].rep = 0;
if (i == seqStore->longLengthPos) {
@@ -2895,9 +2923,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
}
}
- if (seqStoreSeqs[i].offset <= ZSTD_REP_NUM) {
+ if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
/* Derive the correct offset corresponding to a repcode */
- outSeqs[i].rep = seqStoreSeqs[i].offset;
+ outSeqs[i].rep = seqStoreSeqs[i].offBase;
if (outSeqs[i].litLength != 0) {
rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
} else {
@@ -2911,9 +2939,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
outSeqs[i].offset = rawOffset;
/* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
so we provide seqStoreSeqs[i].offset - 1 */
- updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep,
- seqStoreSeqs[i].offset - 1,
- seqStoreSeqs[i].litLength == 0);
+ ZSTD_updateRep(updatedRepcodes.rep,
+ seqStoreSeqs[i].offBase - 1,
+ seqStoreSeqs[i].litLength == 0);
literalsRead += outSeqs[i].litLength;
}
/* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
@@ -3027,7 +3055,7 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
const ZSTD_hufCTables_t* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_hufCTablesMetadata_t* hufMetadata,
- const int disableLiteralsCompression,
+ const int literalsCompressionIsDisabled,
void* workspace, size_t wkspSize)
{
BYTE* const wkspStart = (BYTE*)workspace;
@@ -3045,7 +3073,7 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
/* Prepare nextEntropy assuming reusing the existing table */
ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
- if (disableLiteralsCompression) {
+ if (literalsCompressionIsDisabled) {
DEBUGLOG(5, "set_basic - disabled");
hufMetadata->hType = set_basic;
return 0;
@@ -3192,7 +3220,7 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
&prevEntropy->huf, &nextEntropy->huf,
&entropyMetadata->hufMetadata,
- ZSTD_disableLiteralsCompression(cctxParams),
+ ZSTD_literalsCompressionIsDisabled(cctxParams),
workspace, wkspSize);
FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
entropyMetadata->fseMetadata.fseTablesSize =
@@ -3235,7 +3263,7 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
const FSE_CTable* fseCTable,
- const U32* additionalBits,
+ const U8* additionalBits,
short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
void* workspace, size_t wkspSize)
{
@@ -3319,19 +3347,20 @@ static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
*
* Returns the estimated compressed size of the seqStore, or a zstd error.
*/
-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, const ZSTD_CCtx* zc) {
- ZSTD_entropyCTablesMetadata_t entropyMetadata;
+static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
+ ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+ DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
&zc->blockState.prevCBlock->entropy,
&zc->blockState.nextCBlock->entropy,
&zc->appliedParams,
- &entropyMetadata,
+ entropyMetadata,
zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
(size_t)(seqStore->sequences - seqStore->sequencesStart),
- &zc->blockState.nextCBlock->entropy, &entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
- (int)(entropyMetadata.hufMetadata.hType == set_compressed), 1);
+ &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+ (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
}
/* Returns literals bytes represented in a seqStore */
@@ -3356,7 +3385,7 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
size_t i;
for (i = 0; i < nbSeqs; ++i) {
seqDef seq = seqStore->sequencesStart[i];
- matchBytes += seq.matchLength + MINMATCH;
+ matchBytes += seq.mlBase + MINMATCH;
if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
matchBytes += 0x10000;
}
@@ -3405,11 +3434,13 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
/**
* Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
- * offCode must be an offCode representing a repcode, therefore in the range of [0, 2].
+ * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
*/
-static U32 ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) {
- U32 const adjustedOffCode = offCode + ll0;
- assert(offCode < ZSTD_REP_NUM);
+static U32
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
+{
+ U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */
+ assert(STORED_IS_REPCODE(offCode));
if (adjustedOffCode == ZSTD_REP_NUM) {
/* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
assert(rep[0] > 0);
@@ -3420,11 +3451,16 @@ static U32 ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32
/**
* ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in offset history that may arise
- * due to emission of RLE/raw blocks that disturb the offset history, and replaces any repcodes within
- * the seqStore that may be invalid.
+ * due to emission of RLE/raw blocks that disturb the offset history,
+ * and replaces any repcodes within the seqStore that may be invalid.
+ *
+ * dRepcodes are updated as would be on the decompression side.
+ * cRepcodes are updated exactly in accordance with the seqStore.
*
- * dRepcodes are updated as would be on the decompression side. cRepcodes are updated exactly in
- * accordance with the seqStore.
+ * Note : this function assumes seq->offBase respects the following numbering scheme :
+ * 0 : invalid
+ * 1-3 : repcode 1-3
+ * 4+ : real_offset+3
*/
static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
seqStore_t* const seqStore, U32 const nbSeq) {
@@ -3432,9 +3468,9 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
for (; idx < nbSeq; ++idx) {
seqDef* const seq = seqStore->sequencesStart + idx;
U32 const ll0 = (seq->litLength == 0);
- U32 offCode = seq->offset - 1;
- assert(seq->offset > 0);
- if (offCode <= ZSTD_REP_MOVE) {
+ U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
+ assert(seq->offBase > 0);
+ if (STORED_IS_REPCODE(offCode)) {
U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
/* Adjust simulated decompression repcode history if we come across a mismatch. Replace
@@ -3442,14 +3478,14 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
* repcode history.
*/
if (dRawOffset != cRawOffset) {
- seq->offset = cRawOffset + ZSTD_REP_NUM;
+ seq->offBase = cRawOffset + ZSTD_REP_NUM;
}
}
/* Compression repcode history is always updated with values directly from the unmodified seqStore.
* Decompression repcode history may use modified seq->offset value taken from compression repcode history.
*/
- *dRepcodes = ZSTD_updateRep(dRepcodes->rep, seq->offset - 1, ll0);
- *cRepcodes = ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+ ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
+ ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
}
}
@@ -3458,11 +3494,13 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
*
* Returns the total size of that block (including header) or a ZSTD error code.
*/
-static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
- repcodes_t* const dRep, repcodes_t* const cRep,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize,
- U32 lastBlock, U32 isPartition) {
+static size_t
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ repcodes_t* const dRep, repcodes_t* const cRep,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 lastBlock, U32 isPartition)
+{
const U32 rleMaxLength = 25;
BYTE* op = (BYTE*)dst;
const BYTE* ip = (const BYTE*)src;
@@ -3471,9 +3509,11 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const
/* In case of an RLE or raw block, the simulated decompression repcode history must be reset */
repcodes_t const dRepOriginal = *dRep;
+ DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock");
if (isPartition)
ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart));
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "Block header doesn't fit");
cSeqsSize = ZSTD_entropyCompressSeqStore(seqStore,
&zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
&zc->appliedParams,
@@ -3499,9 +3539,6 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const
return 0;
}
- if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
- zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-
if (cSeqsSize == 0) {
cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
FORWARD_IF_ERROR(cSize, "Nocompress block failed");
@@ -3518,6 +3555,10 @@ static size_t ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const
cSize = ZSTD_blockHeaderSize + cSeqsSize;
DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize);
}
+
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
return cSize;
}
@@ -3528,7 +3569,6 @@ typedef struct {
} seqStoreSplits;
#define MIN_SEQUENCES_BLOCK_SPLITTING 300
-#define MAX_NB_SPLITS 196
/* Helper function to perform the recursive search for block splits.
* Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
@@ -3539,29 +3579,33 @@ typedef struct {
* In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
* In practice, recursion depth usually doesn't go beyond 4.
*
- * Furthermore, the number of splits is capped by MAX_NB_SPLITS. At MAX_NB_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
* maximum of 128 KB, this value is actually impossible to reach.
*/
-static void ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
- const ZSTD_CCtx* zc, const seqStore_t* origSeqStore) {
- seqStore_t fullSeqStoreChunk;
- seqStore_t firstHalfSeqStore;
- seqStore_t secondHalfSeqStore;
+static void
+ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
+ ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
+{
+ seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+ seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+ seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
size_t estimatedOriginalSize;
size_t estimatedFirstHalfSize;
size_t estimatedSecondHalfSize;
size_t midIdx = (startIdx + endIdx)/2;
- if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= MAX_NB_SPLITS) {
+ if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
+ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
return;
}
- ZSTD_deriveSeqStoreChunk(&fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
- ZSTD_deriveSeqStoreChunk(&firstHalfSeqStore, origSeqStore, startIdx, midIdx);
- ZSTD_deriveSeqStoreChunk(&secondHalfSeqStore, origSeqStore, midIdx, endIdx);
- estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&fullSeqStoreChunk, zc);
- estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&firstHalfSeqStore, zc);
- estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&secondHalfSeqStore, zc);
- DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+ DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+ ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
+ ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
+ ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
+ estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
+ estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
+ estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
+ DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
return;
@@ -3596,17 +3640,19 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
*
* Returns combined size of all blocks (which includes headers), or a ZSTD error code.
*/
-static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) {
+static size_t
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
+ const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+{
size_t cSize = 0;
const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst;
- U32 partitions[MAX_NB_SPLITS];
size_t i = 0;
size_t srcBytesTotal = 0;
+ U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+ seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+ seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
- seqStore_t nextSeqStore;
- seqStore_t currSeqStore;
/* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
* may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
@@ -3626,6 +3672,7 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
repcodes_t cRep;
ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+ ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
(unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
@@ -3643,36 +3690,36 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
return cSizeSingleBlock;
}
- ZSTD_deriveSeqStoreChunk(&currSeqStore, &zc->seqStore, 0, partitions[0]);
+ ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
for (i = 0; i <= numSplits; ++i) {
size_t srcBytes;
size_t cSizeChunk;
U32 const lastPartition = (i == numSplits);
U32 lastBlockEntireSrc = 0;
- srcBytes = ZSTD_countSeqStoreLiteralsBytes(&currSeqStore) + ZSTD_countSeqStoreMatchBytes(&currSeqStore);
+ srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
srcBytesTotal += srcBytes;
if (lastPartition) {
/* This is the final partition, need to account for possible last literals */
srcBytes += blockSize - srcBytesTotal;
lastBlockEntireSrc = lastBlock;
} else {
- ZSTD_deriveSeqStoreChunk(&nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]);
+ ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitions[i], partitions[i+1]);
}
- cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, &currSeqStore,
+ cSizeChunk = ZSTD_compressSeqStore_singleBlock(zc, currSeqStore,
&dRep, &cRep,
op, dstCapacity,
ip, srcBytes,
lastBlockEntireSrc, 1 /* isPartition */);
- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(&currSeqStore, zc), cSizeChunk);
+ DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
ip += srcBytes;
op += cSizeChunk;
dstCapacity -= cSizeChunk;
cSize += cSizeChunk;
- currSeqStore = nextSeqStore;
+ *currSeqStore = *nextSeqStore;
assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
}
/* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
@@ -3682,14 +3729,17 @@ static size_t ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, s
return cSize;
}
-static size_t ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, U32 lastBlock) {
+static size_t
+ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 lastBlock)
+{
const BYTE* ip = (const BYTE*)src;
BYTE* op = (BYTE*)dst;
U32 nbSeq;
size_t cSize;
DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
+ assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable);
{ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
@@ -3704,15 +3754,15 @@ static size_t ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart);
}
- assert(zc->appliedParams.splitBlocks == 1);
cSize = ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity, src, srcSize, lastBlock, nbSeq);
FORWARD_IF_ERROR(cSize, "Splitting blocks failed!");
return cSize;
}
-static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
- void* dst, size_t dstCapacity,
- const void* src, size_t srcSize, U32 frame)
+static size_t
+ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 frame)
{
/* This the upper bound for the length of an rle block.
* This isn't the actual upper bound. Finding the real threshold
@@ -3746,12 +3796,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
zc->bmi2);
- if (zc->seqCollector.collectSequences) {
- ZSTD_copyBlockSequences(zc);
- return 0;
- }
-
-
if (frame &&
/* We don't want to emit our first block as a RLE even if it qualifies because
* doing so will cause the decoder (cli only) to throw a "should consume all input error."
@@ -3915,6 +3959,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
ZSTD_overflowCorrectIfNeeded(
ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);
ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+ ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
/* Ensure hash/chain table insertion resumes no sooner than lowlimit */
if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
@@ -3991,7 +4036,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
if (!singleSegment) op[pos++] = windowLogByte;
switch(dictIDSizeCode)
{
- default: assert(0); /* impossible */
+ default:
+ assert(0); /* impossible */
+ ZSTD_FALLTHROUGH;
case 0 : break;
case 1 : op[pos] = (BYTE)(dictID); pos++; break;
case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
@@ -3999,7 +4046,9 @@ static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
}
switch(fcsCode)
{
- default: assert(0); /* impossible */
+ default:
+ assert(0); /* impossible */
+ ZSTD_FALLTHROUGH;
case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;
case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
@@ -4047,7 +4096,7 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSe
{
RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
"wrong cctx stage");
- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm,
+ RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
parameter_unsupported,
"incompatible with ldm");
cctx->externSeqStore.seq = seq;
@@ -4088,7 +4137,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
ms->forceNonContiguous = 0;
ms->nextToUpdate = ms->window.dictLimit;
}
- if (cctx->appliedParams.ldmParams.enableLdm) {
+ if (cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceNonContiguous */ 0);
}
@@ -4157,7 +4206,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
{
const BYTE* ip = (const BYTE*) src;
const BYTE* const iend = ip + srcSize;
- int const loadLdmDict = params->ldmParams.enableLdm && ls != NULL;
+ int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
/* Assert that we the ms params match the params we're being given */
ZSTD_assertEqualCParams(params->cParams, ms->cParams);
@@ -4214,8 +4263,8 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
assert(ms->chainTable != NULL);
ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_READ_SIZE);
} else {
- assert(params->useRowMatchFinder != ZSTD_urm_auto);
- if (params->useRowMatchFinder == ZSTD_urm_enableRowMatchFinder) {
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+ if (params->useRowMatchFinder == ZSTD_ps_enable) {
size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
ZSTD_memset(ms->tagTable, 0, tagTableSize);
ZSTD_row_update(ms, iend-HASH_READ_SIZE);
@@ -4715,7 +4764,7 @@ size_t ZSTD_estimateCDictSize_advanced(
+ ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
/* enableDedicatedDictSearch == 1 ensures that CDict estimation will not be too small
* in case we are using DDS with row-hash. */
- + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams),
+ + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams),
/* enableDedicatedDictSearch */ 1, /* forCCtx */ 0)
+ (dictLoadMethod == ZSTD_dlm_byRef ? 0
: ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));
@@ -4792,7 +4841,7 @@ static size_t ZSTD_initCDict_internal(
static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize,
ZSTD_dictLoadMethod_e dictLoadMethod,
ZSTD_compressionParameters cParams,
- ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ ZSTD_paramSwitch_e useRowMatchFinder,
U32 enableDedicatedDictSearch,
ZSTD_customMem customMem)
{
@@ -4842,7 +4891,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
&cctxParams, customMem);
}
-ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2(
+ZSTD_CDict* ZSTD_createCDict_advanced2(
const void* dict, size_t dictSize,
ZSTD_dictLoadMethod_e dictLoadMethod,
ZSTD_dictContentType_e dictContentType,
@@ -4947,7 +4996,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
ZSTD_dictContentType_e dictContentType,
ZSTD_compressionParameters cParams)
{
- ZSTD_useRowMatchFinderMode_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_urm_auto, &cParams);
+ ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams);
/* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */
size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0);
size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
@@ -5403,7 +5452,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
zcs->outBuffFlushedSize = 0;
zcs->streamStage = zcss_flush; /* pass-through to flush stage */
}
- /* fall-through */
+ ZSTD_FALLTHROUGH;
case zcss_flush:
DEBUGLOG(5, "flush stage");
assert(zcs->appliedParams.outBufferMode == ZSTD_bm_buffered);
@@ -5524,17 +5573,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
dictSize, mode);
}
- if (ZSTD_CParams_shouldEnableLdm(&params.cParams)) {
- /* Enable LDM by default for optimal parser and window size >= 128MB */
- DEBUGLOG(4, "LDM enabled by default (window size >= 128MB, strategy >= btopt)");
- params.ldmParams.enableLdm = 1;
- }
-
- if (ZSTD_CParams_useBlockSplitter(&params.cParams)) {
- DEBUGLOG(4, "Block splitter enabled by default (window size >= 128K, strategy >= btopt)");
- params.splitBlocks = 1;
- }
-
+ params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
+ params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
#ifdef ZSTD_MULTITHREAD
@@ -5715,39 +5755,39 @@ typedef struct {
size_t posInSrc; /* Number of bytes given by sequences provided so far */
} ZSTD_sequencePosition;
-/* Returns a ZSTD error code if sequence is not valid */
-static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength,
- size_t posInSrc, U32 windowLog, size_t dictSize, U32 minMatch) {
- size_t offsetBound;
- U32 windowSize = 1 << windowLog;
+/* ZSTD_validateSequence() :
+ * @offCode : is presumed to follow format required by ZSTD_storeSeq()
+ * @returns a ZSTD error code if sequence is not valid
+ */
+static size_t
+ZSTD_validateSequence(U32 offCode, U32 matchLength,
+ size_t posInSrc, U32 windowLog, size_t dictSize)
+{
+ U32 const windowSize = 1 << windowLog;
/* posInSrc represents the amount of data the the decoder would decode up to this point.
* As long as the amount of data decoded is less than or equal to window size, offsets may be
* larger than the total length of output decoded in order to reference the dict, even larger than
* window size. After output surpasses windowSize, we're limited to windowSize offsets again.
*/
- offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
- RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_detected, "Offset too large!");
- RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlength too small");
+ size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
+ RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
+ RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
return 0;
}
/* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) {
- U32 offCode = rawOffset + ZSTD_REP_MOVE;
- U32 repCode = 0;
+static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+{
+ U32 offCode = STORE_OFFSET(rawOffset);
if (!ll0 && rawOffset == rep[0]) {
- repCode = 1;
+ offCode = STORE_REPCODE_1;
} else if (rawOffset == rep[1]) {
- repCode = 2 - ll0;
+ offCode = STORE_REPCODE(2 - ll0);
} else if (rawOffset == rep[2]) {
- repCode = 3 - ll0;
+ offCode = STORE_REPCODE(3 - ll0);
} else if (ll0 && rawOffset == rep[0] - 1) {
- repCode = 3;
- }
- if (repCode) {
- /* ZSTD_storeSeq expects a number in the range [0, 2] to represent a repcode */
- offCode = repCode - 1;
+ offCode = STORE_REPCODE_3;
}
return offCode;
}
@@ -5755,18 +5795,17 @@ static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32
/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
* ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
*/
-static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
- const void* src, size_t blockSize) {
+static size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+ ZSTD_sequencePosition* seqPos,
+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+ const void* src, size_t blockSize)
+{
U32 idx = seqPos->idx;
BYTE const* ip = (BYTE const*)(src);
const BYTE* const iend = ip + blockSize;
repcodes_t updatedRepcodes;
U32 dictSize;
- U32 litLength;
- U32 matchLength;
- U32 ll0;
- U32 offCode;
if (cctx->cdict) {
dictSize = (U32)cctx->cdict->dictContentSize;
@@ -5777,23 +5816,22 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS
}
ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
- litLength = inSeqs[idx].litLength;
- matchLength = inSeqs[idx].matchLength;
- ll0 = litLength == 0;
- offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
- updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+ U32 const litLength = inSeqs[idx].litLength;
+ U32 const ll0 = (litLength == 0);
+ U32 const matchLength = inSeqs[idx].matchLength;
+ U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+ ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
if (cctx->appliedParams.validateSequences) {
seqPos->posInSrc += litLength + matchLength;
FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
- cctx->appliedParams.cParams.windowLog, dictSize,
- cctx->appliedParams.cParams.minMatch),
+ cctx->appliedParams.cParams.windowLog, dictSize),
"Sequence validation failed");
}
RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
"Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH);
+ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
ip += matchLength + litLength;
}
ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
@@ -5820,9 +5858,11 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ZS
* avoid splitting a match, or to avoid splitting a match such that it would produce a match
* smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
*/
-static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
- const void* src, size_t blockSize) {
+static size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+ const void* src, size_t blockSize)
+{
U32 idx = seqPos->idx;
U32 startPosInSequence = seqPos->posInSequence;
U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize;
@@ -5832,10 +5872,6 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq
repcodes_t updatedRepcodes;
U32 bytesAdjustment = 0;
U32 finalMatchSplit = 0;
- U32 litLength;
- U32 matchLength;
- U32 rawOffset;
- U32 offCode;
if (cctx->cdict) {
dictSize = cctx->cdict->dictContentSize;
@@ -5849,9 +5885,10 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq
ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
const ZSTD_Sequence currSeq = inSeqs[idx];
- litLength = currSeq.litLength;
- matchLength = currSeq.matchLength;
- rawOffset = currSeq.offset;
+ U32 litLength = currSeq.litLength;
+ U32 matchLength = currSeq.matchLength;
+ U32 const rawOffset = currSeq.offset;
+ U32 offCode;
/* Modify the sequence depending on where endPosInSequence lies */
if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
@@ -5904,22 +5941,21 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq
}
}
/* Check if this offset can be represented with a repcode */
- { U32 ll0 = (litLength == 0);
+ { U32 const ll0 = (litLength == 0);
offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
- updatedRepcodes = ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+ ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
}
if (cctx->appliedParams.validateSequences) {
seqPos->posInSrc += litLength + matchLength;
FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
- cctx->appliedParams.cParams.windowLog, dictSize,
- cctx->appliedParams.cParams.minMatch),
+ cctx->appliedParams.cParams.windowLog, dictSize),
"Sequence validation failed");
}
DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
"Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength - MINMATCH);
+ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
ip += matchLength + litLength;
}
DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
@@ -5944,7 +5980,8 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_seq
typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
const void* src, size_t blockSize);
-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) {
+static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
+{
ZSTD_sequenceCopier sequenceCopier = NULL;
assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode));
if (mode == ZSTD_sf_explicitBlockDelimiters) {
@@ -5958,12 +5995,15 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
/* Compress, block-by-block, all of the sequences given.
*
- * Returns the cumulative size of all compressed blocks (including their headers), otherwise a ZSTD error.
+ * Returns the cumulative size of all compressed blocks (including their headers),
+ * otherwise a ZSTD error.
*/
-static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
- void* dst, size_t dstCapacity,
- const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
- const void* src, size_t srcSize) {
+static size_t
+ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+ const void* src, size_t srcSize)
+{
size_t cSize = 0;
U32 lastBlock;
size_t blockSize;
@@ -5973,7 +6013,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
BYTE const* ip = (BYTE const*)src;
BYTE* op = (BYTE*)dst;
- ZSTD_sequenceCopier sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters);
+ ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters);
DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize);
/* Special case: empty frame */
@@ -6073,7 +6113,8 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
- const void* src, size_t srcSize) {
+ const void* src, size_t srcSize)
+{
BYTE* op = (BYTE*)dst;
size_t cSize = 0;
size_t compressedBlocksSize = 0;
@@ -6140,119 +6181,12 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
/*-===== Pre-defined compression levels =====-*/
+#include "clevels.h"
-#define ZSTD_MAX_CLEVEL 22
int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; }
-static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
-{ /* "default" - for any srcSize > 256 KB */
- /* W, C, H, S, L, TL, strat */
- { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */
- { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */
- { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */
- { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */
- { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */
- { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */
- { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */
- { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */
- { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */
- { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */
- { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */
- { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */
- { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */
- { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */
- { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */
- { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */
- { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */
- { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */
- { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */
- { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */
- { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */
- { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */
- { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */
-},
-{ /* for srcSize <= 256 KB */
- /* W, C, H, S, L, T, strat */
- { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
- { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */
- { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */
- { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */
- { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/
- { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/
- { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/
- { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */
- { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
- { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
- { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
- { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/
- { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/
- { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */
- { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
- { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/
- { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/
- { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/
- { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/
- { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
- { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/
- { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/
- { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/
-},
-{ /* for srcSize <= 128 KB */
- /* W, C, H, S, L, T, strat */
- { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
- { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */
- { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */
- { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */
- { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */
- { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */
- { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */
- { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */
- { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
- { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
- { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
- { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */
- { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */
- { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/
- { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
- { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/
- { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/
- { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/
- { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/
- { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/
- { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/
- { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
- { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/
-},
-{ /* for srcSize <= 16 KB */
- /* W, C, H, S, L, T, strat */
- { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
- { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */
- { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */
- { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */
- { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */
- { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/
- { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */
- { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */
- { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/
- { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/
- { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/
- { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/
- { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/
- { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/
- { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/
- { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/
- { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/
- { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/
- { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/
- { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
- { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/
- { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
- { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/
-},
-};
-
static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int const compressionLevel, size_t const dictSize)
{
ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, 0, dictSize, ZSTD_cpm_createCDict);
diff --git a/thirdparty/zstd/compress/zstd_compress_internal.h b/thirdparty/zstd/compress/zstd_compress_internal.h
index 3b04fd09f6..c406e794bd 100644
--- a/thirdparty/zstd/compress/zstd_compress_internal.h
+++ b/thirdparty/zstd/compress/zstd_compress_internal.h
@@ -63,7 +63,7 @@ typedef struct {
} ZSTD_localDict;
typedef struct {
- HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)];
+ HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)];
HUF_repeat repeatMode;
} ZSTD_hufCTables_t;
@@ -129,7 +129,7 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
*********************************/
typedef struct {
- U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the match */
+ U32 off; /* Offset sumtype code for the match, using ZSTD_storeSeq() format */
U32 len; /* Raw length of match */
} ZSTD_match_t;
@@ -179,7 +179,7 @@ typedef struct {
U32 offCodeSumBasePrice; /* to compare to log2(offreq) */
ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */
const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */
- ZSTD_literalCompressionMode_e literalCompressionMode;
+ ZSTD_paramSwitch_e literalCompressionMode;
} optState_t;
typedef struct {
@@ -199,6 +199,8 @@ typedef struct {
*/
} ZSTD_window_t;
+#define ZSTD_WINDOW_START_INDEX 2
+
typedef struct ZSTD_matchState_t ZSTD_matchState_t;
#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */
@@ -264,7 +266,7 @@ typedef struct {
} ldmState_t;
typedef struct {
- U32 enableLdm; /* 1 if enable long distance matching */
+ ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */
U32 hashLog; /* Log size of hashTable */
U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */
U32 minMatchLength; /* Minimum match length */
@@ -295,7 +297,7 @@ struct ZSTD_CCtx_params_s {
* There is no guarantee that hint is close to actual source size */
ZSTD_dictAttachPref_e attachDictPref;
- ZSTD_literalCompressionMode_e literalCompressionMode;
+ ZSTD_paramSwitch_e literalCompressionMode;
/* Multithreading: used to pass parameters to mtctx */
int nbWorkers;
@@ -318,10 +320,10 @@ struct ZSTD_CCtx_params_s {
int validateSequences;
/* Block splitting */
- int splitBlocks;
+ ZSTD_paramSwitch_e useBlockSplitter;
/* Param for deciding whether to use row-based matchfinder */
- ZSTD_useRowMatchFinderMode_e useRowMatchFinder;
+ ZSTD_paramSwitch_e useRowMatchFinder;
/* Always load a dictionary in ext-dict mode (not prefix mode)? */
int deterministicRefPrefix;
@@ -343,6 +345,22 @@ typedef enum {
ZSTDb_buffered
} ZSTD_buffered_policy_e;
+/**
+ * Struct that contains all elements of block splitter that should be allocated
+ * in a wksp.
+ */
+#define ZSTD_MAX_NB_BLOCK_SPLITS 196
+typedef struct {
+ seqStore_t fullSeqStoreChunk;
+ seqStore_t firstHalfSeqStore;
+ seqStore_t secondHalfSeqStore;
+ seqStore_t currSeqStore;
+ seqStore_t nextSeqStore;
+
+ U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS];
+ ZSTD_entropyCTablesMetadata_t entropyMetadata;
+} ZSTD_blockSplitCtx;
+
struct ZSTD_CCtx_s {
ZSTD_compressionStage_e stage;
int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
@@ -374,7 +392,7 @@ struct ZSTD_CCtx_s {
ZSTD_blockState_t blockState;
U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */
- /* Wether we are streaming or not */
+ /* Whether we are streaming or not */
ZSTD_buffered_policy_e bufferedPolicy;
/* streaming */
@@ -408,6 +426,9 @@ struct ZSTD_CCtx_s {
#if ZSTD_TRACE
ZSTD_TraceCtx traceCtx;
#endif
+
+ /* Workspace for block splitter */
+ ZSTD_blockSplitCtx blockSplitCtx;
};
typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
@@ -442,7 +463,7 @@ typedef enum {
typedef size_t (*ZSTD_blockCompressor) (
ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize);
-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_useRowMatchFinderMode_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode);
MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
@@ -476,31 +497,6 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
}
-typedef struct repcodes_s {
- U32 rep[3];
-} repcodes_t;
-
-MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
-{
- repcodes_t newReps;
- if (offset >= ZSTD_REP_NUM) { /* full offset */
- newReps.rep[2] = rep[1];
- newReps.rep[1] = rep[0];
- newReps.rep[0] = offset - ZSTD_REP_MOVE;
- } else { /* repcode */
- U32 const repCode = offset + ll0;
- if (repCode > 0) { /* note : if repCode==0, no change */
- U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
- newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
- newReps.rep[1] = rep[0];
- newReps.rep[0] = currentOffset;
- } else { /* repCode == 0 */
- ZSTD_memcpy(&newReps, rep, sizeof(newReps));
- }
- }
- return newReps;
-}
-
/* ZSTD_cParam_withinBounds:
* @return 1 if value is within cParam bounds,
* 0 otherwise */
@@ -549,17 +545,17 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
return (srcSize >> minlog) + 2;
}
-MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)
+MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* cctxParams)
{
switch (cctxParams->literalCompressionMode) {
- case ZSTD_lcm_huffman:
+ case ZSTD_ps_enable:
return 0;
- case ZSTD_lcm_uncompressed:
+ case ZSTD_ps_disable:
return 1;
default:
assert(0 /* impossible: pre-validated */);
- /* fall-through */
- case ZSTD_lcm_auto:
+ ZSTD_FALLTHROUGH;
+ case ZSTD_ps_auto:
return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
}
}
@@ -569,7 +565,9 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParam
* Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
* large copies.
*/
-static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) {
+static void
+ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w)
+{
assert(iend > ilimit_w);
if (ip <= ilimit_w) {
ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap);
@@ -579,14 +577,30 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const ie
while (ip < iend) *op++ = *ip++;
}
+#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1)
+#define STORE_REPCODE_1 STORE_REPCODE(1)
+#define STORE_REPCODE_2 STORE_REPCODE(2)
+#define STORE_REPCODE_3 STORE_REPCODE(3)
+#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
+#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE)
+#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE)
+#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
+#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
+#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */
+#define STORED_TO_OFFBASE(o) ((o)+1)
+#define OFFBASE_TO_STORED(o) ((o)-1)
+
/*! ZSTD_storeSeq() :
- * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t.
- * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes).
- * `mlBase` : matchLength - MINMATCH
+ * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
+ * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
+ * @matchLength : must be >= MINMATCH
* Allowed to overread literals up to litLimit.
*/
-HINT_INLINE UNUSED_ATTR
-void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase)
+HINT_INLINE UNUSED_ATTR void
+ZSTD_storeSeq(seqStore_t* seqStorePtr,
+ size_t litLength, const BYTE* literals, const BYTE* litLimit,
+ U32 offBase_minus1,
+ size_t matchLength)
{
BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
BYTE const* const litEnd = literals + litLength;
@@ -595,7 +609,7 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera
if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
{ U32 const pos = (U32)((const BYTE*)literals - g_start);
DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
- pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode);
+ pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
}
#endif
assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
@@ -626,19 +640,59 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* litera
seqStorePtr->sequences[0].litLength = (U16)litLength;
/* match offset */
- seqStorePtr->sequences[0].offset = offCode + 1;
+ seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
/* match Length */
- if (mlBase>0xFFFF) {
- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
- seqStorePtr->longLengthType = ZSTD_llt_matchLength;
- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ assert(matchLength >= MINMATCH);
+ { size_t const mlBase = matchLength - MINMATCH;
+ if (mlBase>0xFFFF) {
+ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */
+ seqStorePtr->longLengthType = ZSTD_llt_matchLength;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].mlBase = (U16)mlBase;
}
- seqStorePtr->sequences[0].matchLength = (U16)mlBase;
seqStorePtr->sequences++;
}
+/* ZSTD_updateRep() :
+ * updates in-place @rep (array of repeat offsets)
+ * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
+ */
+MEM_STATIC void
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+{
+ if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */
+ rep[2] = rep[1];
+ rep[1] = rep[0];
+ rep[0] = STORED_OFFSET(offBase_minus1);
+ } else { /* repcode */
+ U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
+ if (repCode > 0) { /* note : if repCode==0, no change */
+ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+ rep[1] = rep[0];
+ rep[0] = currentOffset;
+ } else { /* repCode == 0 */
+ /* nothing to do */
+ }
+ }
+}
+
+typedef struct repcodes_s {
+ U32 rep[3];
+} repcodes_t;
+
+MEM_STATIC repcodes_t
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+{
+ repcodes_t newReps;
+ ZSTD_memcpy(&newReps, rep, sizeof(newReps));
+ ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
+ return newReps;
+}
+
/*-*************************************
* Match length counter
@@ -651,8 +705,14 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
# if STATIC_BMI2
return _tzcnt_u64(val) >> 3;
# else
- unsigned long r = 0;
- return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
+ if (val != 0) {
+ unsigned long r;
+ _BitScanForward64(&r, (U64)val);
+ return (unsigned)(r >> 3);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_ctzll((U64)val) >> 3);
@@ -669,8 +729,14 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
# endif
} else { /* 32 bits */
# if defined(_MSC_VER)
- unsigned long r=0;
- return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0;
+ if (val != 0) {
+ unsigned long r;
+ _BitScanForward(&r, (U32)val);
+ return (unsigned)(r >> 3);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
# elif defined(__GNUC__) && (__GNUC__ >= 3)
return (__builtin_ctz((U32)val) >> 3);
# else
@@ -687,8 +753,14 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
# if STATIC_BMI2
return _lzcnt_u64(val) >> 3;
# else
- unsigned long r = 0;
- return _BitScanReverse64(&r, (U64)val) ? (unsigned)(r >> 3) : 0;
+ if (val != 0) {
+ unsigned long r;
+ _BitScanReverse64(&r, (U64)val);
+ return (unsigned)(r >> 3);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_clzll(val) >> 3);
@@ -702,8 +774,14 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
# endif
} else { /* 32 bits */
# if defined(_MSC_VER)
- unsigned long r = 0;
- return _BitScanReverse( &r, (unsigned long)val ) ? (unsigned)(r >> 3) : 0;
+ if (val != 0) {
+ unsigned long r;
+ _BitScanReverse(&r, (unsigned long)val);
+ return (unsigned)(r >> 3);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
# elif defined(__GNUC__) && (__GNUC__ >= 3)
return (__builtin_clz((U32)val) >> 3);
# else
@@ -884,9 +962,9 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window)
{
- return window.dictLimit == 1 &&
- window.lowLimit == 1 &&
- (window.nextSrc - window.base) == 1;
+ return window.dictLimit == ZSTD_WINDOW_START_INDEX &&
+ window.lowLimit == ZSTD_WINDOW_START_INDEX &&
+ (window.nextSrc - window.base) == ZSTD_WINDOW_START_INDEX;
}
/**
@@ -937,7 +1015,9 @@ MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window,
{
U32 const cycleSize = 1u << cycleLog;
U32 const curr = (U32)((BYTE const*)src - window.base);
- U32 const minIndexToOverflowCorrect = cycleSize + MAX(maxDist, cycleSize);
+ U32 const minIndexToOverflowCorrect = cycleSize
+ + MAX(maxDist, cycleSize)
+ + ZSTD_WINDOW_START_INDEX;
/* Adjust the min index to backoff the overflow correction frequency,
* so we don't waste too much CPU in overflow correction. If this
@@ -1012,10 +1092,14 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
U32 const cycleSize = 1u << cycleLog;
U32 const cycleMask = cycleSize - 1;
U32 const curr = (U32)((BYTE const*)src - window->base);
- U32 const currentCycle0 = curr & cycleMask;
- /* Exclude zero so that newCurrent - maxDist >= 1. */
- U32 const currentCycle1 = currentCycle0 == 0 ? cycleSize : currentCycle0;
- U32 const newCurrent = currentCycle1 + MAX(maxDist, cycleSize);
+ U32 const currentCycle = curr & cycleMask;
+ /* Ensure newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX. */
+ U32 const currentCycleCorrection = currentCycle < ZSTD_WINDOW_START_INDEX
+ ? MAX(cycleSize, ZSTD_WINDOW_START_INDEX)
+ : 0;
+ U32 const newCurrent = currentCycle
+ + currentCycleCorrection
+ + MAX(maxDist, cycleSize);
U32 const correction = curr - newCurrent;
/* maxDist must be a power of two so that:
* (newCurrent & cycleMask) == (curr & cycleMask)
@@ -1031,14 +1115,20 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
window->base += correction;
window->dictBase += correction;
- if (window->lowLimit <= correction) window->lowLimit = 1;
- else window->lowLimit -= correction;
- if (window->dictLimit <= correction) window->dictLimit = 1;
- else window->dictLimit -= correction;
+ if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) {
+ window->lowLimit = ZSTD_WINDOW_START_INDEX;
+ } else {
+ window->lowLimit -= correction;
+ }
+ if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) {
+ window->dictLimit = ZSTD_WINDOW_START_INDEX;
+ } else {
+ window->dictLimit -= correction;
+ }
/* Ensure we can still reference the full window. */
assert(newCurrent >= maxDist);
- assert(newCurrent - maxDist >= 1);
+ assert(newCurrent - maxDist >= ZSTD_WINDOW_START_INDEX);
/* Ensure that lowLimit and dictLimit didn't underflow. */
assert(window->lowLimit <= newCurrent);
assert(window->dictLimit <= newCurrent);
@@ -1149,11 +1239,12 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
ZSTD_memset(window, 0, sizeof(*window));
- window->base = (BYTE const*)"";
- window->dictBase = (BYTE const*)"";
- window->dictLimit = 1; /* start from 1, so that 1st position is valid */
- window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */
- window->nextSrc = window->base + 1; /* see issue #1241 */
+ window->base = (BYTE const*)" ";
+ window->dictBase = (BYTE const*)" ";
+ ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX); /* Start above ZSTD_DUBT_UNSORTED_MARK */
+ window->dictLimit = ZSTD_WINDOW_START_INDEX; /* start from >0, so that 1st position is valid */
+ window->lowLimit = ZSTD_WINDOW_START_INDEX; /* it ensures first and later CCtx usages compress the same */
+ window->nextSrc = window->base + ZSTD_WINDOW_START_INDEX; /* see issue #1241 */
window->nbOverflowCorrections = 0;
}
@@ -1206,15 +1297,15 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
*/
MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog)
{
- U32 const maxDistance = 1U << windowLog;
- U32 const lowestValid = ms->window.lowLimit;
- U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
- U32 const isDictionary = (ms->loadedDictEnd != 0);
+ U32 const maxDistance = 1U << windowLog;
+ U32 const lowestValid = ms->window.lowLimit;
+ U32 const withinWindow = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid;
+ U32 const isDictionary = (ms->loadedDictEnd != 0);
/* When using a dictionary the entire dictionary is valid if a single byte of the dictionary
* is within the window. We invalidate the dictionary (and set loadedDictEnd to 0) when it isn't
* valid for the entire block. So this check is sufficient to find the lowest valid match index.
*/
- U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
+ U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
return matchLowest;
}
diff --git a/thirdparty/zstd/compress/zstd_compress_literals.c b/thirdparty/zstd/compress/zstd_compress_literals.c
index 008337bb1b..52b0a8059a 100644
--- a/thirdparty/zstd/compress/zstd_compress_literals.c
+++ b/thirdparty/zstd/compress/zstd_compress_literals.c
@@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
- const int bmi2)
+ const int bmi2,
+ unsigned suspectUncompressible)
{
size_t const minGain = ZSTD_minGain(srcSize, strategy);
size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
@@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
HUF_compress1X_repeat(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
HUF_compress4X_repeat(
ostart+lhSize, dstCapacity-lhSize, src, srcSize,
HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
if (repeat != HUF_repeat_none) {
/* reused the existing table */
DEBUGLOG(5, "Reusing previous huffman table");
diff --git a/thirdparty/zstd/compress/zstd_compress_literals.h b/thirdparty/zstd/compress/zstd_compress_literals.h
index 9904c0cd30..9775fb97cb 100644
--- a/thirdparty/zstd/compress/zstd_compress_literals.h
+++ b/thirdparty/zstd/compress/zstd_compress_literals.h
@@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
ZSTD_hufCTables_t* nextHuf,
ZSTD_strategy strategy, int disableLiteralCompression,
void* dst, size_t dstCapacity,
const void* src, size_t srcSize,
void* entropyWorkspace, size_t entropyWorkspaceSize,
- const int bmi2);
+ const int bmi2,
+ unsigned suspectUncompressible);
#endif /* ZSTD_COMPRESS_LITERALS_H */
diff --git a/thirdparty/zstd/compress/zstd_compress_sequences.c b/thirdparty/zstd/compress/zstd_compress_sequences.c
index 611eabdcbb..f1e40af2ea 100644
--- a/thirdparty/zstd/compress/zstd_compress_sequences.c
+++ b/thirdparty/zstd/compress/zstd_compress_sequences.c
@@ -275,10 +275,11 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity,
assert(nbSeq_1 > 1);
assert(entropyWorkspaceSize >= sizeof(ZSTD_BuildCTableWksp));
(void)entropyWorkspaceSize;
- FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "");
- { size_t const NCountSize = FSE_writeNCount(op, oend - op, wksp->norm, max, tableLog); /* overflow protected */
+ FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, nbSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed");
+ assert(oend >= op);
+ { size_t const NCountSize = FSE_writeNCount(op, (size_t)(oend - op), wksp->norm, max, tableLog); /* overflow protected */
FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
- FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "");
+ FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp failed");
return NCountSize;
}
}
@@ -312,19 +313,19 @@ ZSTD_encodeSequences_body(
FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]);
BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
if (MEM_32bits()) BIT_flushBits(&blockStream);
- BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
if (MEM_32bits()) BIT_flushBits(&blockStream);
if (longOffsets) {
U32 const ofBits = ofCodeTable[nbSeq-1];
unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
if (extraBits) {
- BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBits);
BIT_flushBits(&blockStream);
}
- BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits,
ofBits - extraBits);
} else {
- BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
}
BIT_flushBits(&blockStream);
@@ -338,8 +339,8 @@ ZSTD_encodeSequences_body(
U32 const mlBits = ML_bits[mlCode];
DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
(unsigned)sequences[n].litLength,
- (unsigned)sequences[n].matchLength + MINMATCH,
- (unsigned)sequences[n].offset);
+ (unsigned)sequences[n].mlBase + MINMATCH,
+ (unsigned)sequences[n].offBase);
/* 32b*/ /* 64b*/
/* (7)*/ /* (7)*/
FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
@@ -350,18 +351,18 @@ ZSTD_encodeSequences_body(
BIT_flushBits(&blockStream); /* (7)*/
BIT_addBits(&blockStream, sequences[n].litLength, llBits);
if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
- BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+ BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
if (longOffsets) {
unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
if (extraBits) {
- BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+ BIT_addBits(&blockStream, sequences[n].offBase, extraBits);
BIT_flushBits(&blockStream); /* (7)*/
}
- BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+ BIT_addBits(&blockStream, sequences[n].offBase >> extraBits,
ofBits - extraBits); /* 31 */
} else {
- BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */
+ BIT_addBits(&blockStream, sequences[n].offBase, ofBits); /* 31 */
}
BIT_flushBits(&blockStream); /* (7)*/
DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
@@ -398,7 +399,7 @@ ZSTD_encodeSequences_default(
#if DYNAMIC_BMI2
-static TARGET_ATTRIBUTE("bmi2") size_t
+static BMI2_TARGET_ATTRIBUTE size_t
ZSTD_encodeSequences_bmi2(
void* dst, size_t dstCapacity,
FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
diff --git a/thirdparty/zstd/compress/zstd_compress_superblock.c b/thirdparty/zstd/compress/zstd_compress_superblock.c
index e4e45069bc..10e3378577 100644
--- a/thirdparty/zstd/compress/zstd_compress_superblock.c
+++ b/thirdparty/zstd/compress/zstd_compress_superblock.c
@@ -132,6 +132,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
const seqDef* sp = sstart;
size_t matchLengthSum = 0;
size_t litLengthSum = 0;
+ (void)(litLengthSum); /* suppress unused variable warning on some environments */
while (send-sp > 0) {
ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
litLengthSum += seqLen.litLength;
@@ -324,7 +325,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit
static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
const BYTE* codeTable, unsigned maxCode,
size_t nbSeq, const FSE_CTable* fseCTable,
- const U32* additionalBits,
+ const U8* additionalBits,
short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
void* workspace, size_t wkspSize)
{
@@ -474,7 +475,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
/* I think there is an optimization opportunity here.
* Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
* since it recalculates estimate from scratch.
- * For example, it would recount literal distribution and symbol codes everytime.
+ * For example, it would recount literal distribution and symbol codes every time.
*/
cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
&nextCBlock->entropy, entropyMetadata,
@@ -538,7 +539,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
repcodes_t rep;
ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
for (seq = sstart; seq < sp; ++seq) {
- rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+ ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
}
ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
}
diff --git a/thirdparty/zstd/compress/zstd_cwksp.h b/thirdparty/zstd/compress/zstd_cwksp.h
index 2656d26ca2..dc3f40c80c 100644
--- a/thirdparty/zstd/compress/zstd_cwksp.h
+++ b/thirdparty/zstd/compress/zstd_cwksp.h
@@ -219,7 +219,7 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
/* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
* to align the beginning of tables section, as well as another n_2=[0, 63] bytes
- * to align the beginning of the aligned secion.
+ * to align the beginning of the aligned section.
*
* n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
* aligneds being sized in multiples of 64 bytes.
@@ -243,12 +243,14 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
/**
* Internal function. Do not use directly.
- * Reserves the given number of bytes within the aligned/buffer segment of the wksp, which
- * counts from the end of the wksp. (as opposed to the object/table segment)
+ * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
+ * which counts from the end of the wksp (as opposed to the object/table segment).
*
* Returns a pointer to the beginning of that space.
*/
-MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) {
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes)
+{
void* const alloc = (BYTE*)ws->allocStart - bytes;
void* const bottom = ws->tableEnd;
DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
@@ -260,6 +262,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t
ws->allocFailed = 1;
return NULL;
}
+ /* the area is reserved from the end of wksp.
+ * If it overlaps with tableValidEnd, it voids guarantees on values' range */
if (alloc < ws->tableValidEnd) {
ws->tableValidEnd = alloc;
}
@@ -269,10 +273,12 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t
/**
* Moves the cwksp to the next phase, and does any necessary allocations.
+ * cwksp initialization must necessarily go through each phase in order.
* Returns a 0 on success, or zstd error
*/
-MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase(
- ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
+MEM_STATIC size_t
+ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase)
+{
assert(phase >= ws->phase);
if (phase > ws->phase) {
/* Going from allocating objects to allocating buffers */
@@ -295,15 +301,15 @@ MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase(
{ /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
void* const alloc = ws->objectEnd;
size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
- void* const end = (BYTE*)alloc + bytesToAlign;
+ void* const objectEnd = (BYTE*)alloc + bytesToAlign;
DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
- RETURN_ERROR_IF(end > ws->workspaceEnd, memory_allocation,
+ RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
"table phase - alignment initial allocation failed!");
- ws->objectEnd = end;
- ws->tableEnd = end;
- ws->tableValidEnd = end;
- }
- }
+ ws->objectEnd = objectEnd;
+ ws->tableEnd = objectEnd; /* table area starts being empty */
+ if (ws->tableValidEnd < ws->tableEnd) {
+ ws->tableValidEnd = ws->tableEnd;
+ } } }
ws->phase = phase;
ZSTD_cwksp_assert_internal_consistency(ws);
}
@@ -313,15 +319,17 @@ MEM_STATIC size_t ZSTD_cwksp_internal_advance_phase(
/**
* Returns whether this object/buffer/etc was allocated in this workspace.
*/
-MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
+{
return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
}
/**
* Internal function. Do not use directly.
*/
-MEM_STATIC void* ZSTD_cwksp_reserve_internal(
- ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
+MEM_STATIC void*
+ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase)
+{
void* alloc;
if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || bytes == 0) {
return NULL;
@@ -351,14 +359,16 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal(
/**
* Reserves and returns unaligned memory.
*/
-MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) {
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
+{
return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
}
/**
* Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
*/
-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+{
void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES),
ZSTD_cwksp_alloc_aligned);
assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
@@ -370,7 +380,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
* their values remain constrained, allowing us to re-use them without
* memset()-ing them.
*/
-MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
+{
const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
void* alloc;
void* end;
@@ -408,9 +419,11 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
/**
* Aligned on sizeof(void*).
+ * Note : should happen only once, at workspace first initialization
*/
-MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
- size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes)
+{
+ size_t const roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
void* alloc = ws->objectEnd;
void* end = (BYTE*)alloc + roundedBytes;
@@ -419,15 +432,15 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
#endif
- DEBUGLOG(5,
+ DEBUGLOG(4,
"cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
- assert(((size_t)alloc & (sizeof(void*)-1)) == 0);
- assert((bytes & (sizeof(void*)-1)) == 0);
+ assert((size_t)alloc % ZSTD_ALIGNOF(void*) == 0);
+ assert(bytes % ZSTD_ALIGNOF(void*) == 0);
ZSTD_cwksp_assert_internal_consistency(ws);
/* we must be in the first phase, no advance is possible */
if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
- DEBUGLOG(4, "cwksp: object alloc failed!");
+ DEBUGLOG(3, "cwksp: object alloc failed!");
ws->allocFailed = 1;
return NULL;
}
@@ -438,7 +451,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
#if ZSTD_ADDRESS_SANITIZER && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
/* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
* either size. */
- alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ alloc = (BYTE*)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
if (ws->isStatic == ZSTD_cwksp_dynamic_alloc) {
__asan_unpoison_memory_region(alloc, bytes);
}
@@ -447,7 +460,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
return alloc;
}
-MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) {
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws)
+{
DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
#if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
diff --git a/thirdparty/zstd/compress/zstd_double_fast.c b/thirdparty/zstd/compress/zstd_double_fast.c
index d0d3a784dd..76933dea26 100644
--- a/thirdparty/zstd/compress/zstd_double_fast.c
+++ b/thirdparty/zstd/compress/zstd_double_fast.c
@@ -48,10 +48,216 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
FORCE_INLINE_TEMPLATE
-size_t ZSTD_compressBlock_doubleFast_generic(
+size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls /* template */)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ const U32 hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ const U32 hBitsS = cParams->chainLog;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ /* presumes that, if there is a dictionary, it must be using Attach mode */
+ const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ size_t mLength;
+ U32 offset;
+ U32 curr;
+
+ /* how many positions to search before increasing step size */
+ const size_t kStepIncr = 1 << kSearchStrength;
+ /* the position at which to increment the step size if no match is found */
+ const BYTE* nextStep;
+ size_t step; /* the current step size */
+
+ size_t hl0; /* the long hash at ip */
+ size_t hl1; /* the long hash at ip1 */
+
+ U32 idxl0; /* the long match index for ip */
+ U32 idxl1; /* the long match index for ip1 */
+
+ const BYTE* matchl0; /* the long match for ip */
+ const BYTE* matchs0; /* the short match for ip */
+ const BYTE* matchl1; /* the long match for ip1 */
+
+ const BYTE* ip = istart; /* the current position */
+ const BYTE* ip1; /* the next position */
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic");
+
+ /* init */
+ ip += ((ip - prefixLowest) == 0);
+ {
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ }
+
+ /* Outer Loop: one iteration per match found and stored */
+ while (1) {
+ step = 1;
+ nextStep = ip + kStepIncr;
+ ip1 = ip + step;
+
+ if (ip1 > ilimit) {
+ goto _cleanup;
+ }
+
+ hl0 = ZSTD_hashPtr(ip, hBitsL, 8);
+ idxl0 = hashLong[hl0];
+ matchl0 = base + idxl0;
+
+ /* Inner Loop: one iteration per search / position */
+ do {
+ const size_t hs0 = ZSTD_hashPtr(ip, hBitsS, mls);
+ const U32 idxs0 = hashSmall[hs0];
+ curr = (U32)(ip-base);
+ matchs0 = base + idxs0;
+
+ hashLong[hl0] = hashSmall[hs0] = curr; /* update hash tables */
+
+ /* check noDict repcode */
+ if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
+ mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+ goto _match_stored;
+ }
+
+ hl1 = ZSTD_hashPtr(ip1, hBitsL, 8);
+
+ if (idxl0 > prefixLowestIndex) {
+ /* check prefix long match */
+ if (MEM_read64(matchl0) == MEM_read64(ip)) {
+ mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8;
+ offset = (U32)(ip-matchl0);
+ while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ }
+
+ idxl1 = hashLong[hl1];
+ matchl1 = base + idxl1;
+
+ if (idxs0 > prefixLowestIndex) {
+ /* check prefix short match */
+ if (MEM_read32(matchs0) == MEM_read32(ip)) {
+ goto _search_next_long;
+ }
+ }
+
+ if (ip1 >= nextStep) {
+ PREFETCH_L1(ip1 + 64);
+ PREFETCH_L1(ip1 + 128);
+ step++;
+ nextStep += kStepIncr;
+ }
+ ip = ip1;
+ ip1 += step;
+
+ hl0 = hl1;
+ idxl0 = idxl1;
+ matchl0 = matchl1;
+ #if defined(__aarch64__)
+ PREFETCH_L1(ip+256);
+ #endif
+ } while (ip1 <= ilimit);
+
+_cleanup:
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+
+_search_next_long:
+
+ /* check prefix long +1 match */
+ if (idxl1 > prefixLowestIndex) {
+ if (MEM_read64(matchl1) == MEM_read64(ip1)) {
+ ip = ip1;
+ mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8;
+ offset = (U32)(ip-matchl1);
+ while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ }
+
+ /* if no long +1 match, explore the short match we found */
+ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4;
+ offset = (U32)(ip - matchs0);
+ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */
+
+ /* fall-through */
+
+_match_found: /* requires ip, offset, mLength */
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+ if (step < 4) {
+ /* It is unsafe to write this value back to the hashtable when ip1 is
+ * greater than or equal to the new ip we will have after we're done
+ * processing this match. Rather than perform that test directly
+ * (ip1 >= ip + mLength), which costs speed in practice, we do a simpler
+ * more predictable test. The minmatch even if we take a short match is
+ * 4 bytes, so as long as step, the distance between ip and ip1
+ * (initially) is less than 4, we know ip1 < new ip. */
+ hashLong[hl1] = (U32)(ip1 - base);
+ }
+
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+
+_match_stored:
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Complementary insertion */
+ /* done after iLimit test, as candidates could be > iend-8 */
+ { U32 const indexToInsert = curr+2;
+ hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+ hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+ hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+ hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+ }
+
+ /* check immediate repcode */
+ while ( (ip <= ilimit)
+ && ( (offset_2>0)
+ & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+ U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ }
+ }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize,
- U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
+ U32 const mls /* template */)
{
ZSTD_compressionParameters const* cParams = &ms->cParams;
U32* const hashLong = ms->hashTable;
@@ -72,54 +278,30 @@ size_t ZSTD_compressBlock_doubleFast_generic(
U32 offsetSaved = 0;
const ZSTD_matchState_t* const dms = ms->dictMatchState;
- const ZSTD_compressionParameters* const dictCParams =
- dictMode == ZSTD_dictMatchState ?
- &dms->cParams : NULL;
- const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ?
- dms->hashTable : NULL;
- const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
- dms->chainTable : NULL;
- const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ?
- dms->window.dictLimit : 0;
- const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
- dms->window.base : NULL;
- const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ?
- dictBase + dictStartIndex : NULL;
- const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
- dms->window.nextSrc : NULL;
- const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
- prefixLowestIndex - (U32)(dictEnd - dictBase) :
- 0;
- const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ?
- dictCParams->hashLog : hBitsL;
- const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ?
- dictCParams->chainLog : hBitsS;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
+ const U32* const dictHashLong = dms->hashTable;
+ const U32* const dictHashSmall = dms->chainTable;
+ const U32 dictStartIndex = dms->window.dictLimit;
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase);
+ const U32 dictHBitsL = dictCParams->hashLog;
+ const U32 dictHBitsS = dictCParams->chainLog;
const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
- DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic");
-
- assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
/* if a dictionary is attached, it must be within window range */
- if (dictMode == ZSTD_dictMatchState) {
- assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
- }
+ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
/* init */
ip += (dictAndPrefixLength == 0);
- if (dictMode == ZSTD_noDict) {
- U32 const curr = (U32)(ip - base);
- U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
- U32 const maxRep = curr - windowLow;
- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
- }
- if (dictMode == ZSTD_dictMatchState) {
- /* dictMatchState repCode checks don't currently handle repCode == 0
- * disabling. */
- assert(offset_1 <= dictAndPrefixLength);
- assert(offset_2 <= dictAndPrefixLength);
- }
+
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
/* Main Search Loop */
while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
@@ -135,29 +317,18 @@ size_t ZSTD_compressBlock_doubleFast_generic(
const BYTE* matchLong = base + matchIndexL;
const BYTE* match = base + matchIndexS;
const U32 repIndex = curr + 1 - offset_1;
- const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
- && repIndex < prefixLowestIndex) ?
+ const BYTE* repMatch = (repIndex < prefixLowestIndex) ?
dictBase + (repIndex - dictIndexDelta) :
base + repIndex;
hashLong[h2] = hashSmall[h] = curr; /* update hash tables */
- /* check dictMatchState repcode */
- if (dictMode == ZSTD_dictMatchState
- && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ /* check repcode */
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
&& (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
- goto _match_stored;
- }
-
- /* check noDict repcode */
- if ( dictMode == ZSTD_noDict
- && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
- mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
- ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
goto _match_stored;
}
@@ -169,7 +340,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(
while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
goto _match_found;
}
- } else if (dictMode == ZSTD_dictMatchState) {
+ } else {
/* check dictMatchState long match */
U32 const dictMatchIndexL = dictHashLong[dictHL];
const BYTE* dictMatchL = dictBase + dictMatchIndexL;
@@ -187,7 +358,7 @@ size_t ZSTD_compressBlock_doubleFast_generic(
if (MEM_read32(match) == MEM_read32(ip)) {
goto _search_next_long;
}
- } else if (dictMode == ZSTD_dictMatchState) {
+ } else {
/* check dictMatchState short match */
U32 const dictMatchIndexS = dictHashSmall[dictHS];
match = dictBase + dictMatchIndexS;
@@ -220,7 +391,7 @@ _search_next_long:
while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
goto _match_found;
}
- } else if (dictMode == ZSTD_dictMatchState) {
+ } else {
/* check dict long +1 match */
U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
@@ -234,7 +405,7 @@ _search_next_long:
} } }
/* if no long +1 match, explore the short match we found */
- if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) {
+ if (matchIndexS < prefixLowestIndex) {
mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
offset = (U32)(curr - matchIndexS);
while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
@@ -244,13 +415,11 @@ _search_next_long:
while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
}
- /* fall-through */
-
_match_found:
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
_match_stored:
/* match found */
@@ -268,43 +437,27 @@ _match_stored:
}
/* check immediate repcode */
- if (dictMode == ZSTD_dictMatchState) {
- while (ip <= ilimit) {
- U32 const current2 = (U32)(ip-base);
- U32 const repIndex2 = current2 - offset_2;
- const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
- && repIndex2 < prefixLowestIndex ?
- dictBase + repIndex2 - dictIndexDelta :
- base + repIndex2;
- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
- const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
- U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
- hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
- hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
- ip += repLength2;
- anchor = ip;
- continue;
- }
- break;
- } }
-
- if (dictMode == ZSTD_noDict) {
- while ( (ip <= ilimit)
- && ( (offset_2>0)
- & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
- /* store sequence */
- size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
- U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
- hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
- hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH);
- ip += rLength;
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ?
+ dictBase + repIndex2 - dictIndexDelta :
+ base + repIndex2;
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
anchor = ip;
- continue; /* faster when present ... (?) */
- } } }
+ continue;
+ }
+ break;
+ }
+ }
} /* while (ip < ilimit) */
/* save reps for next block */
@@ -315,6 +468,24 @@ _match_stored:
return (size_t)(iend - anchor);
}
+#define ZSTD_GEN_DFAST_FN(dictMode, mls) \
+ static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
+ void const* src, size_t srcSize) \
+ { \
+ return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \
+ }
+
+ZSTD_GEN_DFAST_FN(noDict, 4)
+ZSTD_GEN_DFAST_FN(noDict, 5)
+ZSTD_GEN_DFAST_FN(noDict, 6)
+ZSTD_GEN_DFAST_FN(noDict, 7)
+
+ZSTD_GEN_DFAST_FN(dictMatchState, 4)
+ZSTD_GEN_DFAST_FN(dictMatchState, 5)
+ZSTD_GEN_DFAST_FN(dictMatchState, 6)
+ZSTD_GEN_DFAST_FN(dictMatchState, 7)
+
size_t ZSTD_compressBlock_doubleFast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -325,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast(
{
default: /* includes case 3 */
case 4 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
+ return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
+ return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
+ return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+ return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, src, srcSize);
}
}
@@ -345,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
{
default: /* includes case 3 */
case 4 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore, rep, src, srcSize);
case 5 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore, rep, src, srcSize);
case 6 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore, rep, src, srcSize);
case 7 :
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore, rep, src, srcSize);
}
}
@@ -387,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
/* if extDict is invalidated due to maxDistance, switch to "regular" variant */
if (prefixStartIndex == dictStartIndex)
- return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict);
+ return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSize);
/* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */
@@ -409,12 +580,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
hashSmall[hSmall] = hashLong[hLong] = curr; /* update hash table */
if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
- & (offset_1 < curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
+ & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */
&& (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
} else {
if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
@@ -425,7 +596,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
} else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@@ -450,7 +621,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
}
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
} else {
ip += ((ip-anchor) >> kSearchStrength) + 1;
@@ -477,12 +648,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
U32 const repIndex2 = current2 - offset_2;
const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
- & (offset_2 < current2 - dictStartIndex))
+ & (offset_2 <= current2 - dictStartIndex))
&& (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
ip += repLength2;
@@ -500,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
return (size_t)(iend - anchor);
}
+ZSTD_GEN_DFAST_FN(extDict, 4)
+ZSTD_GEN_DFAST_FN(extDict, 5)
+ZSTD_GEN_DFAST_FN(extDict, 6)
+ZSTD_GEN_DFAST_FN(extDict, 7)
size_t ZSTD_compressBlock_doubleFast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -510,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
{
default: /* includes case 3 */
case 4 :
- return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, src, srcSize);
case 5 :
- return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, src, srcSize);
case 6 :
- return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, src, srcSize);
case 7 :
- return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
}
}
diff --git a/thirdparty/zstd/compress/zstd_fast.c b/thirdparty/zstd/compress/zstd_fast.c
index 4edc04dccd..802fc31579 100644
--- a/thirdparty/zstd/compress/zstd_fast.c
+++ b/thirdparty/zstd/compress/zstd_fast.c
@@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
}
+/**
+ * If you squint hard enough (and ignore repcodes), the search operation at any
+ * given position is broken into 4 stages:
+ *
+ * 1. Hash (map position to hash value via input read)
+ * 2. Lookup (map hash val to index via hashtable read)
+ * 3. Load (map index to value at that position via input read)
+ * 4. Compare
+ *
+ * Each of these steps involves a memory read at an address which is computed
+ * from the previous step. This means these steps must be sequenced and their
+ * latencies are cumulative.
+ *
+ * Rather than do 1->2->3->4 sequentially for a single position before moving
+ * onto the next, this implementation interleaves these operations across the
+ * next few positions:
+ *
+ * R = Repcode Read & Compare
+ * H = Hash
+ * T = Table Lookup
+ * M = Match Read & Compare
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N | ... M
+ * N+1 | ... TM
+ * N+2 | R H T M
+ * N+3 | H TM
+ * N+4 | R H T M
+ * N+5 | H ...
+ * N+6 | R ...
+ *
+ * This is very much analogous to the pipelining of execution in a CPU. And just
+ * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
+ * branch).
+ *
+ * When this happens, we throw away our current state, and do the following prep
+ * to re-enter the loop:
+ *
+ * Pos | Time -->
+ * ----+-------------------
+ * N | H T
+ * N+1 | H
+ *
+ * This is also the work we do at the beginning to enter the loop initially.
+ */
FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_fast_generic(
+ZSTD_compressBlock_fast_noDict_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize,
- U32 const mls)
+ U32 const mls, U32 const hasStep)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32* const hashTable = ms->hashTable;
U32 const hlog = cParams->hashLog;
/* support stepSize of 0 */
- size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+ size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2;
const BYTE* const base = ms->window.base;
const BYTE* const istart = (const BYTE*)src;
- /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */
- const BYTE* ip0 = istart;
- const BYTE* ip1;
- const BYTE* anchor = istart;
const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
const BYTE* const prefixStart = base + prefixStartIndex;
const BYTE* const iend = istart + srcSize;
const BYTE* const ilimit = iend - HASH_READ_SIZE;
- U32 offset_1=rep[0], offset_2=rep[1];
+
+ const BYTE* anchor = istart;
+ const BYTE* ip0 = istart;
+ const BYTE* ip1;
+ const BYTE* ip2;
+ const BYTE* ip3;
+ U32 current0;
+
+ U32 rep_offset1 = rep[0];
+ U32 rep_offset2 = rep[1];
U32 offsetSaved = 0;
- /* init */
+ size_t hash0; /* hash for ip0 */
+ size_t hash1; /* hash for ip1 */
+ U32 idx; /* match idx for ip0 */
+ U32 mval; /* src value at match idx */
+
+ U32 offcode;
+ const BYTE* match0;
+ size_t mLength;
+
+ /* ip0 and ip1 are always adjacent. The targetLength skipping and
+ * uncompressibility acceleration is applied to every other position,
+ * matching the behavior of #1562. step therefore represents the gap
+ * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
+ size_t step;
+ const BYTE* nextStep;
+ const size_t kStepIncr = (1 << (kSearchStrength - 1));
+
DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
ip0 += (ip0 == prefixStart);
- ip1 = ip0 + 1;
{ U32 const curr = (U32)(ip0 - base);
U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
U32 const maxRep = curr - windowLow;
- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
+ if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
}
- /* Main Search Loop */
-#ifdef __INTEL_COMPILER
- /* From intel 'The vector pragma indicates that the loop should be
- * vectorized if it is legal to do so'. Can be used together with
- * #pragma ivdep (but have opted to exclude that because intel
- * warns against using it).*/
- #pragma vector always
-#endif
- while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */
- size_t mLength;
- BYTE const* ip2 = ip0 + 2;
- size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls);
- U32 const val0 = MEM_read32(ip0);
- size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls);
- U32 const val1 = MEM_read32(ip1);
- U32 const current0 = (U32)(ip0-base);
- U32 const current1 = (U32)(ip1-base);
- U32 const matchIndex0 = hashTable[h0];
- U32 const matchIndex1 = hashTable[h1];
- BYTE const* repMatch = ip2 - offset_1;
- const BYTE* match0 = base + matchIndex0;
- const BYTE* match1 = base + matchIndex1;
- U32 offcode;
-
-#if defined(__aarch64__)
- PREFETCH_L1(ip0+256);
-#endif
-
- hashTable[h0] = current0; /* update hash table */
- hashTable[h1] = current1; /* update hash table */
-
- assert(ip0 + 1 == ip1);
-
- if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) {
- mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0;
- ip0 = ip2 - mLength;
- match0 = repMatch - mLength;
+ /* start each op */
+_start: /* Requires: ip0 */
+
+ step = stepSize;
+ nextStep = ip0 + kStepIncr;
+
+ /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+ ip1 = ip0 + 1;
+ ip2 = ip0 + step;
+ ip3 = ip2 + 1;
+
+ if (ip3 >= ilimit) {
+ goto _cleanup;
+ }
+
+ hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+ hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+ idx = hashTable[hash0];
+
+ do {
+ /* load repcode match for ip[2]*/
+ const U32 rval = MEM_read32(ip2 - rep_offset1);
+
+ /* write back hash table entry */
+ current0 = (U32)(ip0 - base);
+ hashTable[hash0] = current0;
+
+ /* check repcode at ip[2] */
+ if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
+ ip0 = ip2;
+ match0 = ip0 - rep_offset1;
+ mLength = ip0[-1] == match0[-1];
+ ip0 -= mLength;
+ match0 -= mLength;
+ offcode = STORE_REPCODE_1;
mLength += 4;
- offcode = 0;
goto _match;
}
- if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) {
- /* found a regular match */
- goto _offset;
+
+ /* load match for ip[0] */
+ if (idx >= prefixStartIndex) {
+ mval = MEM_read32(base + idx);
+ } else {
+ mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
}
- if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) {
- /* found a regular match after one literal */
- ip0 = ip1;
- match0 = match1;
+
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
goto _offset;
}
- { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize;
- assert(step >= 2);
- ip0 += step;
- ip1 += step;
- continue;
+
+ /* lookup ip[1] */
+ idx = hashTable[hash1];
+
+ /* hash ip[2] */
+ hash0 = hash1;
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+ /* advance to next positions */
+ ip0 = ip1;
+ ip1 = ip2;
+ ip2 = ip3;
+
+ /* write back hash table entry */
+ current0 = (U32)(ip0 - base);
+ hashTable[hash0] = current0;
+
+ /* load match for ip[0] */
+ if (idx >= prefixStartIndex) {
+ mval = MEM_read32(base + idx);
+ } else {
+ mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */
}
-_offset: /* Requires: ip0, match0 */
- /* Compute the offset code */
- offset_2 = offset_1;
- offset_1 = (U32)(ip0-match0);
- offcode = offset_1 + ZSTD_REP_MOVE;
- mLength = 4;
- /* Count the backwards match length */
- while (((ip0>anchor) & (match0>prefixStart))
- && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */
-_match: /* Requires: ip0, match0, offcode */
- /* Count the forward length */
- mLength += ZSTD_count(ip0+mLength, match0+mLength, iend);
- ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH);
- /* match found */
- ip0 += mLength;
- anchor = ip0;
+ /* check match at ip[0] */
+ if (MEM_read32(ip0) == mval) {
+ /* found a match! */
+ goto _offset;
+ }
- if (ip0 <= ilimit) {
- /* Fill Table */
- assert(base+current0+2 > istart); /* check base overflow */
- hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
- hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
-
- if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */
- while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) {
- /* store sequence */
- size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4;
- { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
- hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
- ip0 += rLength;
- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH);
- anchor = ip0;
- continue; /* faster when present (confirmed on gcc-8) ... (?) */
- } } }
- ip1 = ip0 + 1;
- }
+ /* lookup ip[1] */
+ idx = hashTable[hash1];
+
+ /* hash ip[2] */
+ hash0 = hash1;
+ hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+ /* advance to next positions */
+ ip0 = ip1;
+ ip1 = ip2;
+ ip2 = ip0 + step;
+ ip3 = ip1 + step;
+
+ /* calculate step */
+ if (ip2 >= nextStep) {
+ step++;
+ PREFETCH_L1(ip1 + 64);
+ PREFETCH_L1(ip1 + 128);
+ nextStep += kStepIncr;
+ }
+ } while (ip3 < ilimit);
+
+_cleanup:
+ /* Note that there are probably still a couple positions we could search.
+ * However, it seems to be a meaningful performance hit to try to search
+ * them. So let's not. */
/* save reps for next block */
- rep[0] = offset_1 ? offset_1 : offsetSaved;
- rep[1] = offset_2 ? offset_2 : offsetSaved;
+ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
+ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
/* Return the last literals size */
return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx */
+
+ /* Compute the offset code. */
+ match0 = base + idx;
+ rep_offset2 = rep_offset1;
+ rep_offset1 = (U32)(ip0-match0);
+ offcode = STORE_OFFSET(rep_offset1);
+ mLength = 4;
+
+ /* Count the backwards match length. */
+ while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
+ ip0--;
+ match0--;
+ mLength++;
+ }
+
+_match: /* Requires: ip0, match0, offcode */
+
+ /* Count the forward length. */
+ mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
+
+ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+ ip0 += mLength;
+ anchor = ip0;
+
+ /* write next hash table entry */
+ if (ip1 < ip0) {
+ hashTable[hash1] = (U32)(ip1 - base);
+ }
+
+ /* Fill table and check for immediate repcode. */
+ if (ip0 <= ilimit) {
+ /* Fill Table */
+ assert(base+current0+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+ if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
+ while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
+ { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+ ip0 += rLength;
+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
+ anchor = ip0;
+ continue; /* faster when present (confirmed on gcc-8) ... (?) */
+ } } }
+
+ goto _start;
}
+#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \
+ static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
+ void const* src, size_t srcSize) \
+ { \
+ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
+ }
+
+ZSTD_GEN_FAST_FN(noDict, 4, 1)
+ZSTD_GEN_FAST_FN(noDict, 5, 1)
+ZSTD_GEN_FAST_FN(noDict, 6, 1)
+ZSTD_GEN_FAST_FN(noDict, 7, 1)
+
+ZSTD_GEN_FAST_FN(noDict, 4, 0)
+ZSTD_GEN_FAST_FN(noDict, 5, 0)
+ZSTD_GEN_FAST_FN(noDict, 6, 0)
+ZSTD_GEN_FAST_FN(noDict, 7, 0)
size_t ZSTD_compressBlock_fast(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast(
{
U32 const mls = ms->cParams.minMatch;
assert(ms->dictMatchState == NULL);
- switch(mls)
- {
- default: /* includes case 3 */
- case 4 :
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
- case 5 :
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
- case 6 :
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
- case 7 :
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
+ if (ms->cParams.targetLength > 1) {
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
+ case 5 :
+ return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
+ case 6 :
+ return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
+ case 7 :
+ return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
+ }
+ } else {
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
+ case 5 :
+ return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
+ case 6 :
+ return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
+ case 7 :
+ return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
+ }
+
}
}
FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
- void const* src, size_t srcSize, U32 const mls)
+ void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32* const hashTable = ms->hashTable;
@@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
assert(endIndex - prefixStartIndex <= maxDistance);
(void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
+ (void)hasStep; /* not currently specialized on whether it's accelerated */
+
/* ensure there will be no underflow
* when translating a dict index into a local index */
assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
@@ -272,7 +439,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
} else if ( (matchIndex <= prefixStartIndex) ) {
size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
U32 const dictMatchIndex = dictHashTable[dictHash];
@@ -292,7 +459,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
} /* catch up */
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
}
} else if (MEM_read32(match) != MEM_read32(ip)) {
/* it's not a match, and we're not going to check the dictionary */
@@ -307,7 +474,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
&& (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
offset_2 = offset_1;
offset_1 = offset;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
}
/* match found */
@@ -332,7 +499,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
ip += repLength2;
anchor = ip;
@@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
return (size_t)(iend - anchor);
}
+
+ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
+ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)
+
size_t ZSTD_compressBlock_fast_dictMatchState(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
@@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
{
default: /* includes case 3 */
case 4 :
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
+ return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
case 5 :
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
+ return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
case 6 :
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
+ return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
case 7 :
- return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
+ return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
}
}
static size_t ZSTD_compressBlock_fast_extDict_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
- void const* src, size_t srcSize, U32 const mls)
+ void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
U32* const hashTable = ms->hashTable;
@@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
const BYTE* const ilimit = iend - 8;
U32 offset_1=rep[0], offset_2=rep[1];
+ (void)hasStep; /* not currently specialized on whether it's accelerated */
+
DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
/* switch to "regular" variant if extDict is invalidated due to maxDistance */
if (prefixStartIndex == dictStartIndex)
- return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
+ return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
/* Search Loop */
while (ip < ilimit) { /* < instead of <=, because (ip+1) */
@@ -418,12 +593,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
- & (offset_1 < curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
+ & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
&& (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
ip++;
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
ip += rLength;
anchor = ip;
} else {
@@ -439,7 +614,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
offset_2 = offset_1; offset_1 = offset; /* update offset history */
- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
ip += mLength;
anchor = ip;
} }
@@ -453,12 +628,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
U32 const current2 = (U32)(ip-base);
U32 const repIndex2 = current2 - offset_2;
const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 < curr - dictStartIndex)) /* intentional overflow */
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */
&& (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
{ U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH);
+ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
ip += repLength2;
anchor = ip;
@@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
return (size_t)(iend - anchor);
}
+ZSTD_GEN_FAST_FN(extDict, 4, 0)
+ZSTD_GEN_FAST_FN(extDict, 5, 0)
+ZSTD_GEN_FAST_FN(extDict, 6, 0)
+ZSTD_GEN_FAST_FN(extDict, 7, 0)
size_t ZSTD_compressBlock_fast_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
@@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict(
{
default: /* includes case 3 */
case 4 :
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
case 5 :
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
case 6 :
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
case 7 :
- return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
}
}
diff --git a/thirdparty/zstd/compress/zstd_lazy.c b/thirdparty/zstd/compress/zstd_lazy.c
index 3d523e8472..2e38dcb46d 100644
--- a/thirdparty/zstd/compress/zstd_lazy.c
+++ b/thirdparty/zstd/compress/zstd_lazy.c
@@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
* assumption : curr >= btlow == (curr - btmask)
* doesn't fail */
static void
-ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
+ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
U32 curr, const BYTE* inputEnd,
U32 nbCompares, U32 btLow,
const ZSTD_dictMode_e dictMode)
@@ -93,7 +93,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
assert(curr >= btLow);
assert(ip < iend); /* condition for ZSTD_count */
- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
U32* const nextPtr = bt + 2*(matchIndex & btMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
assert(matchIndex < curr);
@@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
static size_t
ZSTD_DUBT_findBetterDictMatch (
- ZSTD_matchState_t* ms,
+ const ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iend,
size_t* offsetPtr,
size_t bestLength,
@@ -185,7 +185,7 @@ ZSTD_DUBT_findBetterDictMatch (
(void)dictMode;
assert(dictMode == ZSTD_dictMatchState);
- while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+ for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) {
U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
const BYTE* match = dictBase + dictMatchIndex;
@@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch (
U32 matchIndex = dictMatchIndex + dictIndexDelta;
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex);
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
}
if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
break; /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch (
}
if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
}
@@ -309,7 +309,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
matchIndex = hashTable[h];
hashTable[h] = curr; /* Update Hash Table */
- while (nbCompares-- && (matchIndex > windowLow)) {
+ for (; nbCompares && (matchIndex > windowLow); --nbCompares) {
U32* const nextPtr = bt + 2*(matchIndex & btMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
const BYTE* match;
@@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
- bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + curr - matchIndex;
+ bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
if (dictMode == ZSTD_dictMatchState) {
nbCompares = 0; /* in addition to avoiding checking any
@@ -357,6 +357,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
*smallerPtr = *largerPtr = 0;
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
if (dictMode == ZSTD_dictMatchState && nbCompares) {
bestLength = ZSTD_DUBT_findBetterDictMatch(
ms, ip, iend,
@@ -367,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
if (bestLength >= MINMATCH) {
- U32 const mIndex = curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
}
@@ -390,54 +391,6 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
}
-
-static size_t
-ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
- }
-}
-
-
-static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
- }
-}
-
-
-static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
- case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
- case 7 :
- case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
- }
-}
-
/***********************************
* Dedicated dict search
***********************************/
@@ -450,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
U32* const chainTable = ms->chainTable;
U32 const chainSize = 1 << ms->cParams.chainLog;
U32 idx = ms->nextToUpdate;
- U32 const minChain = chainSize < target ? target - chainSize : idx;
+ U32 const minChain = chainSize < target - idx ? target - chainSize : idx;
U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
U32 const cacheSize = bucketSize - 1;
U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
@@ -464,7 +417,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B
U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
U32* const tmpHashTable = hashTable;
U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
- U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+ U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
U32 hashIdx;
@@ -608,7 +561,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
if (ip+currentMl == iLimit) {
/* best possible, avoids read overflow on next attempt */
return ml;
@@ -645,7 +598,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
}
@@ -692,7 +645,7 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
/* inlining is important to hardwire a hot branch (template emulation) */
FORCE_INLINE_TEMPLATE
-size_t ZSTD_HcFindBestMatch_generic (
+size_t ZSTD_HcFindBestMatch(
ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iLimit,
size_t* offsetPtr,
@@ -750,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic (
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = STORE_OFFSET(curr - matchIndex);
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
@@ -758,6 +711,7 @@ size_t ZSTD_HcFindBestMatch_generic (
matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
}
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
if (dictMode == ZSTD_dedicatedDictSearch) {
ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms,
ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -784,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic (
/* save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
@@ -797,310 +752,80 @@ size_t ZSTD_HcFindBestMatch_generic (
return ml;
}
-
-FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
- }
-}
-
-
-static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
- }
-}
-
-
-static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dedicatedDictSearch);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dedicatedDictSearch);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dedicatedDictSearch);
- }
-}
-
-
-FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
- case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
- case 7 :
- case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
- }
-}
-
/* *********************************
* (SIMD) Row-based matchfinder
***********************************/
/* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 1 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
+#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */
+#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */
#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
+#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */
#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1)
-typedef U32 ZSTD_VecMask; /* Clarifies when we are interacting with a U32 representing a mask of matches */
-
-#if !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) /* SIMD SSE version */
-
-#include <emmintrin.h>
-typedef __m128i ZSTD_Vec128;
-
-/* Returns a 128-bit container with 128-bits from src */
-static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
- return _mm_loadu_si128((ZSTD_Vec128 const*)src);
-}
-
-/* Returns a ZSTD_Vec128 with the byte "val" packed 16 times */
-static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
- return _mm_set1_epi8((char)val);
-}
-
-/* Do byte-by-byte comparison result of x and y. Then collapse 128-bit resultant mask
- * into a 32-bit mask that is the MSB of each byte.
- * */
-static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
- return (ZSTD_VecMask)_mm_movemask_epi8(_mm_cmpeq_epi8(x, y));
-}
-
-typedef struct {
- __m128i fst;
- __m128i snd;
-} ZSTD_Vec256;
-
-static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
- ZSTD_Vec256 v;
- v.fst = ZSTD_Vec128_read(ptr);
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
- return v;
-}
-
-static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
- ZSTD_Vec256 v;
- v.fst = ZSTD_Vec128_set8(val);
- v.snd = ZSTD_Vec128_set8(val);
- return v;
-}
-
-static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
- ZSTD_VecMask fstMask;
- ZSTD_VecMask sndMask;
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
- return fstMask | (sndMask << 16);
-}
-
-#elif !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON) /* SIMD ARM NEON Version */
-
-#include <arm_neon.h>
-typedef uint8x16_t ZSTD_Vec128;
-
-static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
- return vld1q_u8((const BYTE* const)src);
-}
-
-static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
- return vdupq_n_u8(val);
-}
-
-/* Mimics '_mm_movemask_epi8()' from SSE */
-static U32 ZSTD_vmovmaskq_u8(ZSTD_Vec128 val) {
- /* Shift out everything but the MSB bits in each byte */
- uint16x8_t highBits = vreinterpretq_u16_u8(vshrq_n_u8(val, 7));
- /* Merge the even lanes together with vsra (right shift and add) */
- uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(highBits, highBits, 7));
- uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
- uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
- /* Extract the low 8 bits from each lane, merge */
- return vgetq_lane_u8(paired64, 0) | ((U32)vgetq_lane_u8(paired64, 8) << 8);
-}
-
-static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
- return (ZSTD_VecMask)ZSTD_vmovmaskq_u8(vceqq_u8(x, y));
-}
-
-typedef struct {
- uint8x16_t fst;
- uint8x16_t snd;
-} ZSTD_Vec256;
-
-static ZSTD_Vec256 ZSTD_Vec256_read(const void* const ptr) {
- ZSTD_Vec256 v;
- v.fst = ZSTD_Vec128_read(ptr);
- v.snd = ZSTD_Vec128_read((ZSTD_Vec128 const*)ptr + 1);
- return v;
-}
-
-static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
- ZSTD_Vec256 v;
- v.fst = ZSTD_Vec128_set8(val);
- v.snd = ZSTD_Vec128_set8(val);
- return v;
-}
-
-static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
- ZSTD_VecMask fstMask;
- ZSTD_VecMask sndMask;
- fstMask = ZSTD_Vec128_cmpMask8(x.fst, y.fst);
- sndMask = ZSTD_Vec128_cmpMask8(x.snd, y.snd);
- return fstMask | (sndMask << 16);
-}
-
-#else /* Scalar fallback version */
-
-#define VEC128_NB_SIZE_T (16 / sizeof(size_t))
-typedef struct {
- size_t vec[VEC128_NB_SIZE_T];
-} ZSTD_Vec128;
-
-static ZSTD_Vec128 ZSTD_Vec128_read(const void* const src) {
- ZSTD_Vec128 ret;
- ZSTD_memcpy(ret.vec, src, VEC128_NB_SIZE_T*sizeof(size_t));
- return ret;
-}
-
-static ZSTD_Vec128 ZSTD_Vec128_set8(BYTE val) {
- ZSTD_Vec128 ret = { {0} };
- int startBit = sizeof(size_t) * 8 - 8;
- for (;startBit >= 0; startBit -= 8) {
- unsigned j = 0;
- for (;j < VEC128_NB_SIZE_T; ++j) {
- ret.vec[j] |= ((size_t)val << startBit);
- }
- }
- return ret;
-}
-
-/* Compare x to y, byte by byte, generating a "matches" bitfield */
-static ZSTD_VecMask ZSTD_Vec128_cmpMask8(ZSTD_Vec128 x, ZSTD_Vec128 y) {
- ZSTD_VecMask res = 0;
- unsigned i = 0;
- unsigned l = 0;
- for (; i < VEC128_NB_SIZE_T; ++i) {
- const size_t cmp1 = x.vec[i];
- const size_t cmp2 = y.vec[i];
- unsigned j = 0;
- for (; j < sizeof(size_t); ++j, ++l) {
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
- res |= ((U32)1 << (j+i*sizeof(size_t)));
- }
- }
- }
- return res;
-}
-
-#define VEC256_NB_SIZE_T 2*VEC128_NB_SIZE_T
-typedef struct {
- size_t vec[VEC256_NB_SIZE_T];
-} ZSTD_Vec256;
-
-static ZSTD_Vec256 ZSTD_Vec256_read(const void* const src) {
- ZSTD_Vec256 ret;
- ZSTD_memcpy(ret.vec, src, VEC256_NB_SIZE_T*sizeof(size_t));
- return ret;
-}
-
-static ZSTD_Vec256 ZSTD_Vec256_set8(BYTE val) {
- ZSTD_Vec256 ret = { {0} };
- int startBit = sizeof(size_t) * 8 - 8;
- for (;startBit >= 0; startBit -= 8) {
- unsigned j = 0;
- for (;j < VEC256_NB_SIZE_T; ++j) {
- ret.vec[j] |= ((size_t)val << startBit);
- }
- }
- return ret;
-}
-
-/* Compare x to y, byte by byte, generating a "matches" bitfield */
-static ZSTD_VecMask ZSTD_Vec256_cmpMask8(ZSTD_Vec256 x, ZSTD_Vec256 y) {
- ZSTD_VecMask res = 0;
- unsigned i = 0;
- unsigned l = 0;
- for (; i < VEC256_NB_SIZE_T; ++i) {
- const size_t cmp1 = x.vec[i];
- const size_t cmp2 = y.vec[i];
- unsigned j = 0;
- for (; j < sizeof(size_t); ++j, ++l) {
- if (((cmp1 >> j*8) & 0xFF) == ((cmp2 >> j*8) & 0xFF)) {
- res |= ((U32)1 << (j+i*sizeof(size_t)));
- }
- }
- }
- return res;
-}
-
-#endif /* !defined(ZSTD_NO_INTRINSICS) && defined(__SSE2__) */
+typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */
/* ZSTD_VecMask_next():
* Starting from the LSB, returns the idx of the next non-zero bit.
* Basically counting the nb of trailing zeroes.
*/
static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-# if defined(_MSC_VER) /* Visual */
- unsigned long r=0;
- return _BitScanForward(&r, val) ? (U32)r : 0;
-# elif defined(__GNUC__) && (__GNUC__ >= 3)
- return (U32)__builtin_ctz(val);
+ assert(val != 0);
+# if defined(_MSC_VER) && defined(_WIN64)
+ if (val != 0) {
+ unsigned long r;
+ _BitScanForward64(&r, val);
+ return (U32)(r);
+ } else {
+ /* Should not reach this code path */
+ __assume(0);
+ }
+# elif (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
+ if (sizeof(size_t) == 4) {
+ U32 mostSignificantWord = (U32)(val >> 32);
+ U32 leastSignificantWord = (U32)val;
+ if (leastSignificantWord == 0) {
+ return 32 + (U32)__builtin_ctz(mostSignificantWord);
+ } else {
+ return (U32)__builtin_ctz(leastSignificantWord);
+ }
+ } else {
+ return (U32)__builtin_ctzll(val);
+ }
# else
- /* Software ctz version: http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup */
- static const U32 multiplyDeBruijnBitPosition[32] =
- {
- 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
- 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
- };
- return multiplyDeBruijnBitPosition[((U32)((v & -(int)v) * 0x077CB531U)) >> 27];
+ /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
+ * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
+ */
+ val = ~val & (val - 1ULL); /* Lowest set bit mask */
+ val = val - ((val >> 1) & 0x5555555555555555);
+ val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
+ return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
# endif
}
-/* ZSTD_VecMask_rotateRight():
- * Rotates a bitfield to the right by "rotation" bits.
- * If the rotation is greater than totalBits, the returned mask is 0.
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
*/
-FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_VecMask_rotateRight(ZSTD_VecMask mask, U32 const rotation, U32 const totalBits) {
- if (rotation == 0)
- return mask;
- switch (totalBits) {
- default:
- assert(0);
- case 16:
- return (mask >> rotation) | (U16)(mask << (16 - rotation));
- case 32:
- return (mask >> rotation) | (U32)(mask << (32 - rotation));
- }
+FORCE_INLINE_TEMPLATE
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+ assert(count < 64);
+ count &= 0x3F; /* for fickle pattern recognition */
+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+FORCE_INLINE_TEMPLATE
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+ assert(count < 32);
+ count &= 0x1F; /* for fickle pattern recognition */
+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+FORCE_INLINE_TEMPLATE
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+ assert(count < 16);
+ count &= 0x0F; /* for fickle pattern recognition */
+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
}
/* ZSTD_row_nextIndex():
@@ -1126,20 +851,24 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
*/
FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
PREFETCH_L1(hashTable + relRow);
- if (rowLog == 5) {
+ if (rowLog >= 5) {
PREFETCH_L1(hashTable + relRow + 16);
+ /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */
}
PREFETCH_L1(tagTable + relRow);
- assert(rowLog == 4 || rowLog == 5);
+ if (rowLog == 6) {
+ PREFETCH_L1(tagTable + relRow + 32);
+ }
+ assert(rowLog == 4 || rowLog == 5 || rowLog == 6);
assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */
- assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on a multiple of 32 or 64 bytes */
+ assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (32,64,128) */
}
/* ZSTD_row_fillHashCache():
* Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
* but not beyond iLimit.
*/
-static void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
U32 const rowLog, U32 const mls,
U32 idx, const BYTE* const iLimit)
{
@@ -1179,35 +908,65 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
}
}
-/* ZSTD_row_update_internal():
- * Inserts the byte at ip into the appropriate position in the hash table.
- * Determines the relative row, and the position within the {16, 32} entry row to insert at.
+/* ZSTD_row_update_internalImpl():
+ * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
*/
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
- U32 const mls, U32 const rowLog,
- U32 const rowMask, U32 const useCache)
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+ U32 updateStartIdx, U32 const updateEndIdx,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
{
U32* const hashTable = ms->hashTable;
U16* const tagTable = ms->tagTable;
U32 const hashLog = ms->rowHashLog;
const BYTE* const base = ms->window.base;
- const U32 target = (U32)(ip - base);
- U32 idx = ms->nextToUpdate;
- DEBUGLOG(6, "ZSTD_row_update_internal(): nextToUpdate=%u, current=%u", idx, target);
- for (; idx < target; ++idx) {
- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, idx, hashLog, rowLog, mls)
- : (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+ DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
+ for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
+ : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
U32* const row = hashTable + relRow;
BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
Explicit cast allows us to get exact desired position within each row */
U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
- assert(hash == ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
+ assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
- row[pos] = idx;
+ row[pos] = updateStartIdx;
}
+}
+
+/* ZSTD_row_update_internal():
+ * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
+ * Skips sections of long matches as is necessary.
+ */
+FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+ U32 const mls, U32 const rowLog,
+ U32 const rowMask, U32 const useCache)
+{
+ U32 idx = ms->nextToUpdate;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ const U32 kSkipThreshold = 384;
+ const U32 kMaxMatchStartPositionsToUpdate = 96;
+ const U32 kMaxMatchEndPositionsToUpdate = 32;
+
+ if (useCache) {
+ /* Only skip positions when using hash cache, i.e.
+ * if we are loading a dict, don't skip anything.
+ * If we decide to skip, then we only update a set number
+ * of positions at the beginning and end of the match.
+ */
+ if (UNLIKELY(target - idx > kSkipThreshold)) {
+ U32 const bound = idx + kMaxMatchStartPositionsToUpdate;
+ ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache);
+ idx = target - kMaxMatchEndPositionsToUpdate;
+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1);
+ }
+ }
+ assert(target >= idx);
+ ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache);
ms->nextToUpdate = target;
}
@@ -1216,7 +975,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const
* processing.
*/
void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
+ const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
const U32 rowMask = (1u << rowLog) - 1;
const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
@@ -1224,26 +983,131 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
}
+#if defined(ZSTD_ARCH_X86_SSE2)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
+{
+ const __m128i comparisonMask = _mm_set1_epi8((char)tag);
+ int matches[4] = {0};
+ int i;
+ assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
+ for (i=0; i<nbChunks; i++) {
+ const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i));
+ const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask);
+ matches[i] = _mm_movemask_epi8(equalMask);
+ }
+ if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head);
+ if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
+ assert(nbChunks == 4);
+ return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
+}
+#endif
+
/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
* the hash at the nth position in a row of the tagTable.
- */
-FORCE_INLINE_TEMPLATE
-ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) {
- ZSTD_VecMask matches = 0;
- if (rowEntries == 16) {
- ZSTD_Vec128 hashes = ZSTD_Vec128_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
- ZSTD_Vec128 expandedTags = ZSTD_Vec128_set8(tag);
- matches = ZSTD_Vec128_cmpMask8(hashes, expandedTags);
- } else if (rowEntries == 32) {
- ZSTD_Vec256 hashes = ZSTD_Vec256_read(tagRow + ZSTD_ROW_HASH_TAG_OFFSET);
- ZSTD_Vec256 expandedTags = ZSTD_Vec256_set8(tag);
- matches = ZSTD_Vec256_cmpMask8(hashes, expandedTags);
- } else {
- assert(0);
+ * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
+ * to match up with the actual layout of the entries within the hashTable */
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+{
+ const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+
+#if defined(ZSTD_ARCH_X86_SSE2)
+
+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+
+#else /* SW or NEON-LE */
+
+# if defined(ZSTD_ARCH_ARM_NEON)
+ /* This NEON path only works for little endian - otherwise use SWAR below */
+ if (MEM_isLittleEndian()) {
+ if (rowEntries == 16) {
+ const uint8x16_t chunk = vld1q_u8(src);
+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+ const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
+ const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
+ const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
+ const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
+ const U16 hi = (U16)vgetq_lane_u8(t3, 8);
+ const U16 lo = (U16)vgetq_lane_u8(t3, 0);
+ return ZSTD_rotateRight_U16((hi << 8) | lo, head);
+ } else if (rowEntries == 32) {
+ const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+ const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
+ const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
+ const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
+ const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
+ const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
+ const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
+ const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
+ const uint8x8x2_t t3 = vuzp_u8(t2, t0);
+ const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
+ const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
+ return ZSTD_rotateRight_U32(matches, head);
+ } else { /* rowEntries == 64 */
+ const uint8x16x4_t chunk = vld4q_u8(src);
+ const uint8x16_t dup = vdupq_n_u8(tag);
+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+ return ZSTD_rotateRight_U64(matches, head);
+ }
}
- /* Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- to match up with the actual layout of the entries within the hashTable */
- return ZSTD_VecMask_rotateRight(matches, head, rowEntries);
+# endif /* ZSTD_ARCH_ARM_NEON */
+ /* SWAR */
+ { const size_t chunkSize = sizeof(size_t);
+ const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
+ const size_t xFF = ~((size_t)0);
+ const size_t x01 = xFF / 0xFF;
+ const size_t x80 = x01 << 7;
+ const size_t splatChar = tag * x01;
+ ZSTD_VecMask matches = 0;
+ int i = rowEntries - chunkSize;
+ assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8));
+ if (MEM_isLittleEndian()) { /* runtime check so have two loops */
+ const size_t extractMagic = (xFF / 0x7F) >> chunkSize;
+ do {
+ size_t chunk = MEM_readST(&src[i]);
+ chunk ^= splatChar;
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
+ matches <<= chunkSize;
+ matches |= (chunk * extractMagic) >> shiftAmount;
+ i -= chunkSize;
+ } while (i >= 0);
+ } else { /* big endian: reverse bits during extraction */
+ const size_t msb = xFF ^ (xFF >> 1);
+ const size_t extractMagic = (msb / 0x1FF) | msb;
+ do {
+ size_t chunk = MEM_readST(&src[i]);
+ chunk ^= splatChar;
+ chunk = (((chunk | x80) - x01) | chunk) & x80;
+ matches <<= chunkSize;
+ matches |= ((chunk >> 7) * extractMagic) >> shiftAmount;
+ i -= chunkSize;
+ } while (i >= 0);
+ }
+ matches = ~matches;
+ if (rowEntries == 16) {
+ return ZSTD_rotateRight_U16((U16)matches, head);
+ } else if (rowEntries == 32) {
+ return ZSTD_rotateRight_U32((U32)matches, head);
+ } else {
+ return ZSTD_rotateRight_U64((U64)matches, head);
+ }
+ }
+#endif
}
/* The high-level approach of the SIMD row based match finder is as follows:
@@ -1262,7 +1126,7 @@ ZSTD_VecMask ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, con
* - Pick the longest match.
*/
FORCE_INLINE_TEMPLATE
-size_t ZSTD_RowFindBestMatch_generic (
+size_t ZSTD_RowFindBestMatch(
ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iLimit,
size_t* offsetPtr,
@@ -1293,11 +1157,13 @@ size_t ZSTD_RowFindBestMatch_generic (
/* DMS/DDS variables that may be referenced laster */
const ZSTD_matchState_t* const dms = ms->dictMatchState;
- size_t ddsIdx;
- U32 ddsExtraAttempts; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
- U32 dmsTag;
- U32* dmsRow;
- BYTE* dmsTagRow;
+
+ /* Initialize the following variables to satisfy static analyzer */
+ size_t ddsIdx = 0;
+ U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
+ U32 dmsTag = 0;
+ U32* dmsRow = NULL;
+ BYTE* dmsTagRow = NULL;
if (dictMode == ZSTD_dedicatedDictSearch) {
const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
@@ -1329,7 +1195,7 @@ size_t ZSTD_RowFindBestMatch_generic (
U32* const row = hashTable + relRow;
BYTE* tagRow = (BYTE*)(tagTable + relRow);
U32 const head = *tagRow & rowMask;
- U32 matchBuffer[32 /* maximum nb entries per row */];
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
size_t numMatches = 0;
size_t currMatch = 0;
ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
@@ -1379,12 +1245,13 @@ size_t ZSTD_RowFindBestMatch_generic (
/* Save best solution */
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - matchIndex + ZSTD_REP_MOVE;
+ *offsetPtr = STORE_OFFSET(curr - matchIndex);
if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
}
}
}
+ assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
if (dictMode == ZSTD_dedicatedDictSearch) {
ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms,
ip, iLimit, prefixStart, curr, dictLimit, ddsIdx);
@@ -1397,7 +1264,7 @@ size_t ZSTD_RowFindBestMatch_generic (
const U32 dmsIndexDelta = dictLimit - dmsSize;
{ U32 const head = *dmsTagRow & rowMask;
- U32 matchBuffer[32 /* maximum nb row entries */];
+ U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
size_t numMatches = 0;
size_t currMatch = 0;
ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
@@ -1426,7 +1293,8 @@ size_t ZSTD_RowFindBestMatch_generic (
if (currentMl > ml) {
ml = currentMl;
- *offsetPtr = curr - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ assert(curr > matchIndex + dmsIndexDelta);
+ *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
if (ip+currentMl == iLimit) break;
}
}
@@ -1435,84 +1303,175 @@ size_t ZSTD_RowFindBestMatch_generic (
return ml;
}
-/* Inlining is important to hardwire a hot branch (template emulation) */
-FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectMLS (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- const ZSTD_dictMode_e dictMode, size_t* offsetPtr, const U32 rowLog)
-{
- switch(ms->cParams.minMatch)
- {
- default : /* includes case 3 */
- case 4 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, dictMode, rowLog);
- case 5 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, dictMode, rowLog);
- case 7 :
- case 6 : return ZSTD_RowFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, dictMode, rowLog);
- }
-}
-FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_selectRowLog (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
- switch(cappedSearchLog)
- {
- default :
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 4);
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_noDict, offsetPtr, 5);
+typedef size_t (*searchMax_f)(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+
+/**
+ * This struct contains the functions necessary for lazy to search.
+ * Currently, that is only searchMax. However, it is still valuable to have the
+ * VTable because this makes it easier to add more functions to the VTable later.
+ *
+ * TODO: The start of the search function involves loading and calculating a
+ * bunch of constants from the ZSTD_matchState_t. These computations could be
+ * done in an initialization function, and saved somewhere in the match state.
+ * Then we could pass a pointer to the saved state instead of the match state,
+ * and avoid duplicate computations.
+ *
+ * TODO: Move the match re-winding into searchMax. This improves compression
+ * ratio, and unlocks further simplifications with the next TODO.
+ *
+ * TODO: Try moving the repcode search into searchMax. After the re-winding
+ * and repcode search are in searchMax, there is no more logic in the match
+ * finder loop that requires knowledge about the dictMode. So we should be
+ * able to avoid force inlining it, and we can join the extDict loop with
+ * the single segment loop. It should go in searchMax instead of its own
+ * function to avoid having multiple virtual function calls per search.
+ */
+typedef struct {
+ searchMax_f searchMax;
+} ZSTD_LazyVTable;
+
+#define GEN_ZSTD_BT_VTABLE(dictMode, mls) \
+ static size_t ZSTD_BtFindBestMatch_##dictMode##_##mls( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offsetPtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
+ } \
+ static const ZSTD_LazyVTable ZSTD_BtVTable_##dictMode##_##mls = { \
+ ZSTD_BtFindBestMatch_##dictMode##_##mls \
+ };
+
+#define GEN_ZSTD_HC_VTABLE(dictMode, mls) \
+ static size_t ZSTD_HcFindBestMatch_##dictMode##_##mls( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offsetPtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \
+ } \
+ static const ZSTD_LazyVTable ZSTD_HcVTable_##dictMode##_##mls = { \
+ ZSTD_HcFindBestMatch_##dictMode##_##mls \
+ };
+
+#define GEN_ZSTD_ROW_VTABLE(dictMode, mls, rowLog) \
+ static size_t ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog( \
+ ZSTD_matchState_t* ms, \
+ const BYTE* ip, const BYTE* const iLimit, \
+ size_t* offsetPtr) \
+ { \
+ assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \
+ assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \
+ return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \
+ } \
+ static const ZSTD_LazyVTable ZSTD_RowVTable_##dictMode##_##mls##_##rowLog = { \
+ ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog \
+ };
+
+#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \
+ X(dictMode, mls, 4) \
+ X(dictMode, mls, 5) \
+ X(dictMode, mls, 6)
+
+#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \
+ ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6)
+
+#define ZSTD_FOR_EACH_MLS(X, dictMode) \
+ X(dictMode, 4) \
+ X(dictMode, 5) \
+ X(dictMode, 6)
+
+#define ZSTD_FOR_EACH_DICT_MODE(X, ...) \
+ X(__VA_ARGS__, noDict) \
+ X(__VA_ARGS__, extDict) \
+ X(__VA_ARGS__, dictMatchState) \
+ X(__VA_ARGS__, dedicatedDictSearch)
+
+/* Generate Row VTables for each combination of (dictMode, mls, rowLog) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_VTABLE)
+/* Generate Binary Tree VTables for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_VTABLE)
+/* Generate Hash Chain VTables for each combination of (dictMode, mls) */
+ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_VTABLE)
+
+#define GEN_ZSTD_BT_VTABLE_ARRAY(dictMode) \
+ { \
+ &ZSTD_BtVTable_##dictMode##_4, \
+ &ZSTD_BtVTable_##dictMode##_5, \
+ &ZSTD_BtVTable_##dictMode##_6 \
}
-}
-FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dictMatchState_selectRowLog(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
- switch(cappedSearchLog)
- {
- default :
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 4);
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dictMatchState, offsetPtr, 5);
+#define GEN_ZSTD_HC_VTABLE_ARRAY(dictMode) \
+ { \
+ &ZSTD_HcVTable_##dictMode##_4, \
+ &ZSTD_HcVTable_##dictMode##_5, \
+ &ZSTD_HcVTable_##dictMode##_6 \
}
-}
-FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
- switch(cappedSearchLog)
- {
- default :
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 4);
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_dedicatedDictSearch, offsetPtr, 5);
+#define GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, mls) \
+ { \
+ &ZSTD_RowVTable_##dictMode##_##mls##_4, \
+ &ZSTD_RowVTable_##dictMode##_##mls##_5, \
+ &ZSTD_RowVTable_##dictMode##_##mls##_6 \
}
-}
-FORCE_INLINE_TEMPLATE size_t ZSTD_RowFindBestMatch_extDict_selectRowLog (
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr)
-{
- const U32 cappedSearchLog = MIN(ms->cParams.searchLog, 5);
- switch(cappedSearchLog)
- {
- default :
- case 4 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 4);
- case 5 : return ZSTD_RowFindBestMatch_selectMLS(ms, ip, iLimit, ZSTD_extDict, offsetPtr, 5);
+#define GEN_ZSTD_ROW_VTABLE_ARRAY(dictMode) \
+ { \
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 4), \
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 5), \
+ GEN_ZSTD_ROW_VTABLE_ARRAY_(dictMode, 6) \
}
-}
+#define GEN_ZSTD_VTABLE_ARRAY(X) \
+ { \
+ X(noDict), \
+ X(extDict), \
+ X(dictMatchState), \
+ X(dedicatedDictSearch) \
+ }
/* *******************************
* Common parser - lazy strategy
*********************************/
typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e;
+/**
+ * This table is indexed first by the four ZSTD_dictMode_e values, and then
+ * by the two searchMethod_e values. NULLs are placed for configurations
+ * that should never occur (extDict modes go to the other implementation
+ * below and there is no DDSS for binary tree search yet).
+ */
+
+static ZSTD_LazyVTable const*
+ZSTD_selectLazyVTable(ZSTD_matchState_t const* ms, searchMethod_e searchMethod, ZSTD_dictMode_e dictMode)
+{
+ /* Fill the Hc/Bt VTable arrays with the right functions for the (dictMode, mls) combination. */
+ ZSTD_LazyVTable const* const hcVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_HC_VTABLE_ARRAY);
+ ZSTD_LazyVTable const* const btVTables[4][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_BT_VTABLE_ARRAY);
+ /* Fill the Row VTable array with the right functions for the (dictMode, mls, rowLog) combination. */
+ ZSTD_LazyVTable const* const rowVTables[4][3][3] = GEN_ZSTD_VTABLE_ARRAY(GEN_ZSTD_ROW_VTABLE_ARRAY);
+
+ U32 const mls = MAX(4, MIN(6, ms->cParams.minMatch));
+ U32 const rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
+ switch (searchMethod) {
+ case search_hashChain:
+ return hcVTables[dictMode][mls - 4];
+ case search_binaryTree:
+ return btVTables[dictMode][mls - 4];
+ case search_rowHash:
+ return rowVTables[dictMode][mls - 4][rowLog - 4];
+ default:
+ return NULL;
+ }
+}
+
FORCE_INLINE_TEMPLATE size_t
ZSTD_compressBlock_lazy_generic(
ZSTD_matchState_t* ms, seqStore_t* seqStore,
@@ -1525,46 +1484,12 @@ ZSTD_compressBlock_lazy_generic(
const BYTE* ip = istart;
const BYTE* anchor = istart;
const BYTE* const iend = istart + srcSize;
- const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
+ const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8;
const BYTE* const base = ms->window.base;
const U32 prefixLowestIndex = ms->window.dictLimit;
const BYTE* const prefixLowest = base + prefixLowestIndex;
- const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
- typedef size_t (*searchMax_f)(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
-
- /**
- * This table is indexed first by the four ZSTD_dictMode_e values, and then
- * by the two searchMethod_e values. NULLs are placed for configurations
- * that should never occur (extDict modes go to the other implementation
- * below and there is no DDSS for binary tree search yet).
- */
- const searchMax_f searchFuncs[4][3] = {
- {
- ZSTD_HcFindBestMatch_selectMLS,
- ZSTD_BtFindBestMatch_selectMLS,
- ZSTD_RowFindBestMatch_selectRowLog
- },
- {
- NULL,
- NULL,
- NULL
- },
- {
- ZSTD_HcFindBestMatch_dictMatchState_selectMLS,
- ZSTD_BtFindBestMatch_dictMatchState_selectMLS,
- ZSTD_RowFindBestMatch_dictMatchState_selectRowLog
- },
- {
- ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS,
- NULL,
- ZSTD_RowFindBestMatch_dedicatedDictSearch_selectRowLog
- }
- };
-
- searchMax_f const searchMax = searchFuncs[dictMode][(int)searchMethod];
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, dictMode)->searchMax;
U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
const int isDMS = dictMode == ZSTD_dictMatchState;
@@ -1599,6 +1524,7 @@ ZSTD_compressBlock_lazy_generic(
}
if (searchMethod == search_rowHash) {
+ const U32 rowLog = MAX(4, MIN(6, ms->cParams.searchLog));
ZSTD_row_fillHashCache(ms, base, rowLog,
MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
ms->nextToUpdate, ilimit);
@@ -1613,8 +1539,9 @@ ZSTD_compressBlock_lazy_generic(
#endif
while (ip < ilimit) {
size_t matchLength=0;
- size_t offset=0;
+ size_t offcode=STORE_REPCODE_1;
const BYTE* start=ip+1;
+ DEBUGLOG(7, "search baseline (depth 0)");
/* check repCode */
if (isDxS) {
@@ -1640,7 +1567,7 @@ ZSTD_compressBlock_lazy_generic(
{ size_t offsetFound = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
if (ml2 > matchLength)
- matchLength = ml2, start = ip, offset=offsetFound;
+ matchLength = ml2, start = ip, offcode=offsetFound;
}
if (matchLength < 4) {
@@ -1651,14 +1578,15 @@ ZSTD_compressBlock_lazy_generic(
/* let's try to find a better solution */
if (depth>=1)
while (ip<ilimit) {
+ DEBUGLOG(7, "search depth 1");
ip ++;
if ( (dictMode == ZSTD_noDict)
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
}
if (isDxS) {
const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1670,30 +1598,31 @@ ZSTD_compressBlock_lazy_generic(
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
int const gain2 = (int)(mlRep * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
}
}
{ size_t offset2=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offcode = offset2, start = ip;
continue; /* search a better one */
} }
/* let's find an even better one */
if ((depth==2) && (ip<ilimit)) {
+ DEBUGLOG(7, "search depth 2");
ip ++;
if ( (dictMode == ZSTD_noDict)
- && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
}
if (isDxS) {
const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1705,46 +1634,45 @@ ZSTD_compressBlock_lazy_generic(
const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
int const gain2 = (int)(mlRep * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((mlRep >= 4) && (gain2 > gain1))
- matchLength = mlRep, offset = 0, start = ip;
+ matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
}
}
{ size_t offset2=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offcode = offset2, start = ip;
continue;
} } }
break; /* nothing found : store previous solution */
}
/* NOTE:
- * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
- * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which
- * overflows the pointer, which is undefined behavior.
+ * Pay attention that `start[-value]` can lead to strange undefined behavior
+ * notably if `value` is unsigned, resulting in a large positive `-value`.
*/
/* catch up */
- if (offset) {
+ if (STORED_IS_OFFSET(offcode)) {
if (dictMode == ZSTD_noDict) {
- while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
- && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
+ while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
+ && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */
{ start--; matchLength++; }
}
if (isDxS) {
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
}
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
}
/* store sequence */
_storeSequence:
- { size_t const litLength = start - anchor;
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ { size_t const litLength = (size_t)(start - anchor);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
anchor = ip = start + matchLength;
}
@@ -1760,8 +1688,8 @@ _storeSequence:
&& (MEM_read32(repMatch) == MEM_read32(ip)) ) {
const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
ip += matchLength;
anchor = ip;
continue;
@@ -1775,8 +1703,8 @@ _storeSequence:
&& (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
/* store sequence */
matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
ip += matchLength;
anchor = ip;
continue; /* faster when present ... (?) */
@@ -1955,15 +1883,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const U32 windowLog = ms->cParams.windowLog;
const U32 rowLog = ms->cParams.searchLog < 5 ? 4 : 5;
- typedef size_t (*searchMax_f)(
- ZSTD_matchState_t* ms,
- const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
- const searchMax_f searchFuncs[3] = {
- ZSTD_HcFindBestMatch_extDict_selectMLS,
- ZSTD_BtFindBestMatch_extDict_selectMLS,
- ZSTD_RowFindBestMatch_extDict_selectRowLog
- };
- searchMax_f searchMax = searchFuncs[(int)searchMethod];
+ searchMax_f const searchMax = ZSTD_selectLazyVTable(ms, searchMethod, ZSTD_extDict)->searchMax;
U32 offset_1 = rep[0], offset_2 = rep[1];
DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
@@ -1985,7 +1905,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
#endif
while (ip < ilimit) {
size_t matchLength=0;
- size_t offset=0;
+ size_t offcode=STORE_REPCODE_1;
const BYTE* start=ip+1;
U32 curr = (U32)(ip-base);
@@ -1995,7 +1915,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */
- & (offset_1 < curr+1 - windowLow) ) /* note: we are searching at curr+1 */
+ & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */
if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
/* repcode detected we should take it */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
@@ -2007,10 +1927,10 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
{ size_t offsetFound = 999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
if (ml2 > matchLength)
- matchLength = ml2, start = ip, offset=offsetFound;
+ matchLength = ml2, start = ip, offcode=offsetFound;
}
- if (matchLength < 4) {
+ if (matchLength < 4) {
ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
continue;
}
@@ -2021,30 +1941,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
ip ++;
curr++;
/* check repCode */
- if (offset) {
+ if (offcode) {
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
const U32 repIndex = (U32)(curr - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
int const gain2 = (int)(repLength * 3);
- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
} }
/* search match, depth 1 */
{ size_t offset2=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offcode = offset2, start = ip;
continue; /* search a better one */
} }
@@ -2053,48 +1973,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
ip ++;
curr++;
/* check repCode */
- if (offset) {
+ if (offcode) {
const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
const U32 repIndex = (U32)(curr - offset_1);
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
- & (offset_1 < curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+ & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
int const gain2 = (int)(repLength * 4);
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
if ((repLength >= 4) && (gain2 > gain1))
- matchLength = repLength, offset = 0, start = ip;
+ matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
} }
/* search match, depth 2 */
{ size_t offset2=999999999;
size_t const ml2 = searchMax(ms, ip, iend, &offset2);
- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
if ((ml2 >= 4) && (gain2 > gain1)) {
- matchLength = ml2, offset = offset2, start = ip;
+ matchLength = ml2, offcode = offset2, start = ip;
continue;
} } }
break; /* nothing found : store previous solution */
}
/* catch up */
- if (offset) {
- U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ if (STORED_IS_OFFSET(offcode)) {
+ U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
- offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
}
/* store sequence */
_storeSequence:
- { size_t const litLength = start - anchor;
- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ { size_t const litLength = (size_t)(start - anchor);
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
anchor = ip = start + matchLength;
}
@@ -2106,13 +2026,13 @@ _storeSequence:
const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
const BYTE* const repMatch = repBase + repIndex;
if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */
- & (offset_2 < repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
+ & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */
if (MEM_read32(ip) == MEM_read32(repMatch)) {
/* repcode detected we should take it */
const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
- offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
- ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
ip += matchLength;
anchor = ip;
continue; /* faster when present ... (?) */
diff --git a/thirdparty/zstd/compress/zstd_ldm.c b/thirdparty/zstd/compress/zstd_ldm.c
index fa4ebeabd7..f662b2546e 100644
--- a/thirdparty/zstd/compress/zstd_ldm.c
+++ b/thirdparty/zstd/compress/zstd_ldm.c
@@ -159,12 +159,12 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params)
size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize)
+ ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t));
- return params.enableLdm ? totalSize : 0;
+ return params.enableLdm == ZSTD_ps_enable ? totalSize : 0;
}
size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
{
- return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
+ return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0;
}
/** ZSTD_ldm_getBucket() :
@@ -478,7 +478,7 @@ static size_t ZSTD_ldm_generateSequences_internal(
*/
if (anchor > ip + hashed) {
ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength);
- /* Continue the outter loop at anchor (ip + hashed == anchor). */
+ /* Continue the outer loop at anchor (ip + hashed == anchor). */
ip = anchor - hashed;
break;
}
@@ -579,7 +579,9 @@ size_t ZSTD_ldm_generateSequences(
return 0;
}
-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+void
+ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch)
+{
while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
if (srcSize <= seq->litLength) {
@@ -657,7 +659,7 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
- ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ ZSTD_paramSwitch_e useRowMatchFinder,
void const* src, size_t srcSize)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -709,8 +711,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
rep[0] = sequence.offset;
/* Store the sequence */
ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
- sequence.offset + ZSTD_REP_MOVE,
- sequence.matchLength - MINMATCH);
+ STORE_OFFSET(sequence.offset),
+ sequence.matchLength);
ip += sequence.matchLength;
}
}
diff --git a/thirdparty/zstd/compress/zstd_ldm.h b/thirdparty/zstd/compress/zstd_ldm.h
index 393466fa9f..4e68dbf52e 100644
--- a/thirdparty/zstd/compress/zstd_ldm.h
+++ b/thirdparty/zstd/compress/zstd_ldm.h
@@ -66,7 +66,7 @@ size_t ZSTD_ldm_generateSequences(
*/
size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
- ZSTD_useRowMatchFinderMode_e useRowMatchFinder,
+ ZSTD_paramSwitch_e useRowMatchFinder,
void const* src, size_t srcSize);
/**
diff --git a/thirdparty/zstd/compress/zstd_ldm_geartab.h b/thirdparty/zstd/compress/zstd_ldm_geartab.h
index e5c24d856b..647f865be2 100644
--- a/thirdparty/zstd/compress/zstd_ldm_geartab.h
+++ b/thirdparty/zstd/compress/zstd_ldm_geartab.h
@@ -11,7 +11,10 @@
#ifndef ZSTD_LDM_GEARTAB_H
#define ZSTD_LDM_GEARTAB_H
-static U64 ZSTD_ldm_gearTab[256] = {
+#include "../common/compiler.h" /* UNUSED_ATTR */
+#include "../common/mem.h" /* U64 */
+
+static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] = {
0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc,
0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05,
0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e,
diff --git a/thirdparty/zstd/compress/zstd_opt.c b/thirdparty/zstd/compress/zstd_opt.c
index 402a7e5c76..1b1ddad428 100644
--- a/thirdparty/zstd/compress/zstd_opt.c
+++ b/thirdparty/zstd/compress/zstd_opt.c
@@ -14,7 +14,6 @@
#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
-#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */
#define ZSTD_MAX_PRICE (1<<30)
#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
@@ -24,11 +23,11 @@
* Price functions for optimal parser
***************************************/
-#if 0 /* approximation at bit level */
+#if 0 /* approximation at bit level (for tests) */
# define BITCOST_ACCURACY 0
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat))
-#elif 0 /* fractional bit accuracy */
+# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+#elif 0 /* fractional bit accuracy (for tests) */
# define BITCOST_ACCURACY 8
# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
@@ -66,7 +65,7 @@ MEM_STATIC double ZSTD_fCost(U32 price)
static int ZSTD_compressedLiterals(optState_t const* const optPtr)
{
- return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed;
+ return optPtr->literalCompressionMode != ZSTD_ps_disable;
}
static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
@@ -79,25 +78,46 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
}
-/* ZSTD_downscaleStat() :
- * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus)
- * return the resulting sum of elements */
-static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus)
+static U32 sum_u32(const unsigned table[], size_t nbElts)
+{
+ size_t n;
+ U32 total = 0;
+ for (n=0; n<nbElts; n++) {
+ total += table[n];
+ }
+ return total;
+}
+
+static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
{
U32 s, sum=0;
- DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1);
- assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31);
+ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+ assert(shift < 30);
for (s=0; s<lastEltIndex+1; s++) {
- table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus));
+ table[s] = 1 + (table[s] >> shift);
sum += table[s];
}
return sum;
}
+/* ZSTD_scaleStats() :
+ * reduce all elements in table is sum too large
+ * return the resulting sum of elements */
+static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+{
+ U32 const prevsum = sum_u32(table, lastEltIndex+1);
+ U32 const factor = prevsum >> logTarget;
+ DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
+ assert(logTarget < 30);
+ if (factor <= 1) return prevsum;
+ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+}
+
/* ZSTD_rescaleFreqs() :
* if first block (detected by optPtr->litLengthSum == 0) : init statistics
* take hints from dictionary if there is one
- * or init from zero, using src for literals stats, or flat 1 for match symbols
+ * and init from zero if there is none,
+ * using src for literals stats, and baseline stats for sequence symbols
* otherwise downscale existing stats, to be used as seed for next block.
*/
static void
@@ -126,7 +146,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
optPtr->litSum = 0;
for (lit=0; lit<=MaxLit; lit++) {
U32 const scaleLog = 11; /* scale to 2K */
- U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
+ U32 const bitCost = HUF_getNbBitsFromCTable(optPtr->symbolCosts->huf.CTable, lit);
assert(bitCost <= scaleLog);
optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
optPtr->litSum += optPtr->litFreq[lit];
@@ -174,14 +194,19 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
if (compressedLiterals) {
unsigned lit = MaxLit;
HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
- optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
}
- { unsigned ll;
- for (ll=0; ll<=MaxLL; ll++)
- optPtr->litLengthFreq[ll] = 1;
+ { unsigned const baseLLfreqs[MaxLL+1] = {
+ 4, 2, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1
+ };
+ ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(baseLLfreqs));
+ optPtr->litLengthSum = sum_u32(baseLLfreqs, MaxLL+1);
}
- optPtr->litLengthSum = MaxLL+1;
{ unsigned ml;
for (ml=0; ml<=MaxML; ml++)
@@ -189,21 +214,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
}
optPtr->matchLengthSum = MaxML+1;
- { unsigned of;
- for (of=0; of<=MaxOff; of++)
- optPtr->offCodeFreq[of] = 1;
+ { unsigned const baseOFCfreqs[MaxOff+1] = {
+ 6, 2, 1, 1, 2, 3, 4, 4,
+ 4, 3, 2, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1
+ };
+ ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(baseOFCfreqs));
+ optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
}
- optPtr->offCodeSum = MaxOff+1;
+
}
} else { /* new block : re-use previous statistics, scaled down */
if (compressedLiterals)
- optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
- optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
- optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
- optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+ optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
+ optPtr->litLengthSum = ZSTD_scaleStats(optPtr->litLengthFreq, MaxLL, 11);
+ optPtr->matchLengthSum = ZSTD_scaleStats(optPtr->matchLengthFreq, MaxML, 11);
+ optPtr->offCodeSum = ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff, 11);
}
ZSTD_setBasePrices(optPtr, optLevel);
@@ -239,7 +269,16 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
* cost of literalLength symbol */
static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
{
- if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel);
+ assert(litLength <= ZSTD_BLOCKSIZE_MAX);
+ if (optPtr->priceType == zop_predef)
+ return WEIGHT(litLength, optLevel);
+ /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+ * because it isn't representable in the zstd format. So instead just
+ * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
+ * would be all literals.
+ */
+ if (litLength == ZSTD_BLOCKSIZE_MAX)
+ return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
/* dynamic statistics */
{ U32 const llCode = ZSTD_LLcode(litLength);
@@ -252,15 +291,17 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
/* ZSTD_getMatchPrice() :
* Provides the cost of the match part (offset + matchLength) of a sequence
* Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
+ * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
+ * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
+ */
FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offset,
+ZSTD_getMatchPrice(U32 const offcode,
U32 const matchLength,
const optState_t* const optPtr,
int const optLevel)
{
U32 price;
- U32 const offCode = ZSTD_highbit32(offset+1);
+ U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
U32 const mlBase = matchLength - MINMATCH;
assert(matchLength >= MINMATCH);
@@ -303,8 +344,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
optPtr->litLengthSum++;
}
- /* match offset code (0-2=>repCode; 3+=>offset+2) */
- { U32 const offCode = ZSTD_highbit32(offsetCode+1);
+ /* offset code : expected to follow storeSeq() numeric representation */
+ { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
assert(offCode <= MaxOff);
optPtr->offCodeFreq[offCode]++;
optPtr->offCodeSum++;
@@ -338,7 +379,7 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
/* Update hashTable3 up to ip (excluded)
Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms,
+static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
U32* nextToUpdate3,
const BYTE* const ip)
{
@@ -364,11 +405,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms,
* Binary Tree search
***************************************/
/** ZSTD_insertBt1() : add one or multiple positions to tree.
- * ip : assumed <= iend-8 .
+ * @param ip assumed <= iend-8 .
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
* @return : nb of positions added */
static U32 ZSTD_insertBt1(
- ZSTD_matchState_t* ms,
+ const ZSTD_matchState_t* ms,
const BYTE* const ip, const BYTE* const iend,
+ U32 const target,
U32 const mls, const int extDict)
{
const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -391,7 +434,10 @@ static U32 ZSTD_insertBt1(
U32* smallerPtr = bt + 2*(curr&btMask);
U32* largerPtr = smallerPtr + 1;
U32 dummy32; /* to be nullified at the end */
- U32 const windowLow = ms->window.lowLimit;
+ /* windowLow is based on target because
+ * we only need positions that will be in the window at the end of the tree update.
+ */
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, target, cParams->windowLog);
U32 matchEndIdx = curr+8+1;
size_t bestLength = 8;
U32 nbCompares = 1U << cParams->searchLog;
@@ -404,11 +450,12 @@ static U32 ZSTD_insertBt1(
DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr);
+ assert(curr <= target);
assert(ip <= iend-8); /* required for h calculation */
hashTable[h] = curr; /* Update Hash Table */
assert(windowLow > 0);
- while (nbCompares-- && (matchIndex >= windowLow)) {
+ for (; nbCompares && (matchIndex >= windowLow); --nbCompares) {
U32* const nextPtr = bt + 2*(matchIndex & btMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
assert(matchIndex < curr);
@@ -492,7 +539,7 @@ void ZSTD_updateTree_internal(
idx, target, dictMode);
while(idx < target) {
- U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict);
+ U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, target, mls, dictMode == ZSTD_extDict);
assert(idx < (U32)(idx + forward));
idx += forward;
}
@@ -597,7 +644,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
repCode, ll0, repOffset, repLen);
bestLength = repLen;
- matches[mnum].off = repCode - ll0;
+ matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */
matches[mnum].len = (U32)repLen;
mnum++;
if ( (repLen > sufficient_len)
@@ -626,7 +673,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
bestLength = mlen;
assert(curr > matchIndex3);
assert(mnum==0); /* no prior solution */
- matches[0].off = (curr - matchIndex3) + ZSTD_REP_MOVE;
+ matches[0].off = STORE_OFFSET(curr - matchIndex3);
matches[0].len = (U32)mlen;
mnum = 1;
if ( (mlen > sufficient_len) |
@@ -635,11 +682,11 @@ U32 ZSTD_insertBtAndGetAllMatches (
return 1;
} } }
/* no dictMatchState lookup: dicts don't have a populated HC3 table */
- }
+ } /* if (mls == 3) */
hashTable[h] = curr; /* Update Hash Table */
- while (nbCompares-- && (matchIndex >= matchLow)) {
+ for (; nbCompares && (matchIndex >= matchLow); --nbCompares) {
U32* const nextPtr = bt + 2*(matchIndex & btMask);
const BYTE* match;
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
@@ -660,20 +707,19 @@ U32 ZSTD_insertBtAndGetAllMatches (
if (matchLength > bestLength) {
DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
- (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE);
+ (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
assert(matchEndIdx > matchIndex);
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
bestLength = matchLength;
- matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].off = STORE_OFFSET(curr - matchIndex);
matches[mnum].len = (U32)matchLength;
mnum++;
if ( (matchLength > ZSTD_OPT_NUM)
| (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
break; /* drop, to preserve bt consistency (miss a little bit of compression) */
- }
- }
+ } }
if (match[matchLength] < ip[matchLength]) {
/* match smaller than current */
@@ -692,12 +738,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
*smallerPtr = *largerPtr = 0;
+ assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */
if (dictMode == ZSTD_dictMatchState && nbCompares) {
size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
U32 dictMatchIndex = dms->hashTable[dmsH];
const U32* const dmsBt = dms->chainTable;
commonLengthSmaller = commonLengthLarger = 0;
- while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) {
+ for (; nbCompares && (dictMatchIndex > dmsLowLimit); --nbCompares) {
const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
const BYTE* match = dmsBase + dictMatchIndex;
@@ -708,18 +755,17 @@ U32 ZSTD_insertBtAndGetAllMatches (
if (matchLength > bestLength) {
matchIndex = dictMatchIndex + dmsIndexDelta;
DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
- (U32)matchLength, curr - matchIndex, curr - matchIndex + ZSTD_REP_MOVE);
+ (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
if (matchLength > matchEndIdx - matchIndex)
matchEndIdx = matchIndex + (U32)matchLength;
bestLength = matchLength;
- matches[mnum].off = (curr - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].off = STORE_OFFSET(curr - matchIndex);
matches[mnum].len = (U32)matchLength;
mnum++;
if ( (matchLength > ZSTD_OPT_NUM)
| (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) {
break; /* drop, to guarantee consistency (miss a little bit of compression) */
- }
- }
+ } }
if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */
if (match[matchLength] < ip[matchLength]) {
@@ -729,39 +775,91 @@ U32 ZSTD_insertBtAndGetAllMatches (
/* match is larger than current */
commonLengthLarger = matchLength;
dictMatchIndex = nextPtr[0];
- }
- }
- }
+ } } } /* if (dictMode == ZSTD_dictMatchState) */
assert(matchEndIdx > curr+8);
ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
return mnum;
}
-
-FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
- ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */
- ZSTD_matchState_t* ms,
- U32* nextToUpdate3,
- const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
- const U32 rep[ZSTD_REP_NUM],
- U32 const ll0,
- U32 const lengthToBeat)
+typedef U32 (*ZSTD_getAllMatchesFn)(
+ ZSTD_match_t*,
+ ZSTD_matchState_t*,
+ U32*,
+ const BYTE*,
+ const BYTE*,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0,
+ U32 const lengthToBeat);
+
+FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
+ ZSTD_match_t* matches,
+ ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* ip,
+ const BYTE* const iHighLimit,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0,
+ U32 const lengthToBeat,
+ const ZSTD_dictMode_e dictMode,
+ const U32 mls)
{
- const ZSTD_compressionParameters* const cParams = &ms->cParams;
- U32 const matchLengthSearch = cParams->minMatch;
- DEBUGLOG(8, "ZSTD_BtGetAllMatches");
- if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
- ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
- switch(matchLengthSearch)
- {
- case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3);
- default :
- case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4);
- case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5);
- case 7 :
- case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6);
+ assert(BOUNDED(3, ms->cParams.minMatch, 6) == mls);
+ DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=%d, mls=%u)", (int)dictMode, mls);
+ if (ip < ms->window.base + ms->nextToUpdate)
+ return 0; /* skipped area */
+ ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode);
+ return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, mls);
+}
+
+#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##dictMode##_##mls
+
+#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \
+ static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \
+ ZSTD_match_t* matches, \
+ ZSTD_matchState_t* ms, \
+ U32* nextToUpdate3, \
+ const BYTE* ip, \
+ const BYTE* const iHighLimit, \
+ const U32 rep[ZSTD_REP_NUM], \
+ U32 const ll0, \
+ U32 const lengthToBeat) \
+ { \
+ return ZSTD_btGetAllMatches_internal( \
+ matches, ms, nextToUpdate3, ip, iHighLimit, \
+ rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \
+ }
+
+#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \
+ GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \
+ GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \
+ GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \
+ GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6)
+
+GEN_ZSTD_BT_GET_ALL_MATCHES(noDict)
+GEN_ZSTD_BT_GET_ALL_MATCHES(extDict)
+GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState)
+
+#define ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \
+ { \
+ ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \
+ ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \
+ ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \
+ ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \
}
+
+static ZSTD_getAllMatchesFn
+ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode)
+{
+ ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = {
+ ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict),
+ ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict),
+ ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState)
+ };
+ U32 const mls = BOUNDED(3, ms->cParams.minMatch, 6);
+ assert((U32)dictMode < 3);
+ assert(mls - 3 < 4);
+ return getAllMatchesFns[(int)dictMode][mls - 3];
}
/*************************
@@ -770,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
/* Struct containing info needed to make decision about ldm inclusion */
typedef struct {
- rawSeqStore_t seqStore; /* External match candidates store for this block */
- U32 startPosInBlock; /* Start position of the current match candidate */
- U32 endPosInBlock; /* End position of the current match candidate */
- U32 offset; /* Offset of the match candidate */
+ rawSeqStore_t seqStore; /* External match candidates store for this block */
+ U32 startPosInBlock; /* Start position of the current match candidate */
+ U32 endPosInBlock; /* End position of the current match candidate */
+ U32 offset; /* Offset of the match candidate */
} ZSTD_optLdm_t;
/* ZSTD_optLdm_skipRawSeqStoreBytes():
- * Moves forward in rawSeqStore by nbBytes, which will update the fields 'pos' and 'posInSequence'.
+ * Moves forward in @rawSeqStore by @nbBytes,
+ * which will update the fields 'pos' and 'posInSequence'.
*/
-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) {
+static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes)
+{
U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes);
while (currPos && rawSeqStore->pos < rawSeqStore->size) {
rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos];
@@ -800,8 +900,10 @@ static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t
* Calculates the beginning and end of the next match in the current block.
* Updates 'pos' and 'posInSequence' of the ldmSeqStore.
*/
-static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
- U32 blockBytesRemaining) {
+static void
+ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock,
+ U32 blockBytesRemaining)
+{
rawSeq currSeq;
U32 currBlockEndPos;
U32 literalsBytesRemaining;
@@ -813,8 +915,8 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu
optLdm->endPosInBlock = UINT_MAX;
return;
}
- /* Calculate appropriate bytes left in matchLength and litLength after adjusting
- based on ldmSeqStore->posInSequence */
+ /* Calculate appropriate bytes left in matchLength and litLength
+ * after adjusting based on ldmSeqStore->posInSequence */
currSeq = optLdm->seqStore.seq[optLdm->seqStore.pos];
assert(optLdm->seqStore.posInSequence <= currSeq.litLength + currSeq.matchLength);
currBlockEndPos = currPosInBlock + blockBytesRemaining;
@@ -850,15 +952,16 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 cu
}
/* ZSTD_optLdm_maybeAddMatch():
- * Adds a match if it's long enough, based on it's 'matchStartPosInBlock'
- * and 'matchEndPosInBlock', into 'matches'. Maintains the correct ordering of 'matches'
+ * Adds a match if it's long enough,
+ * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock',
+ * into 'matches'. Maintains the correct ordering of 'matches'.
*/
static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
- ZSTD_optLdm_t* optLdm, U32 currPosInBlock) {
- U32 posDiff = currPosInBlock - optLdm->startPosInBlock;
+ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
+{
+ U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
/* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
- U32 candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
- U32 candidateOffCode = optLdm->offset + ZSTD_REP_MOVE;
+ U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
/* Ensure that current block position is not outside of the match */
if (currPosInBlock < optLdm->startPosInBlock
@@ -868,6 +971,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
}
if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
+ U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
candidateOffCode, candidateMatchLength, currPosInBlock);
matches[*nbMatches].len = candidateMatchLength;
@@ -879,8 +983,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
/* ZSTD_optLdm_processMatchCandidate():
* Wrapper function to update ldm seq store and call ldm functions as necessary.
*/
-static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_t* matches, U32* nbMatches,
- U32 currPosInBlock, U32 remainingBytes) {
+static void
+ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
+ ZSTD_match_t* matches, U32* nbMatches,
+ U32 currPosInBlock, U32 remainingBytes)
+{
if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) {
return;
}
@@ -891,19 +998,19 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_match_
* at the end of a match from the ldm seq store, and will often be some bytes
* over beyond matchEndPosInBlock. As such, we need to correct for these "overshoots"
*/
- U32 posOvershoot = currPosInBlock - optLdm->endPosInBlock;
+ U32 const posOvershoot = currPosInBlock - optLdm->endPosInBlock;
ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOvershoot);
- }
+ }
ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes);
}
ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock);
}
+
/*-*******************************
* Optimal parser
*********************************/
-
static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
{
return sol.litlen + sol.mlen;
@@ -944,6 +1051,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
const BYTE* const prefixStart = base + ms->window.dictLimit;
const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ ZSTD_getAllMatchesFn getAllMatches = ZSTD_selectBtGetAllMatches(ms, dictMode);
+
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
U32 nextToUpdate3 = ms->nextToUpdate;
@@ -971,7 +1080,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
/* find first match */
{ U32 const litlen = (U32)(ip - anchor);
U32 const ll0 = !litlen;
- U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
+ U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
(U32)(ip-istart), (U32)(iend - ip));
if (!nbMatches) { ip++; continue; }
@@ -985,18 +1094,18 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
* in every price. We include the literal length to avoid negative
* prices when we subtract the previous literal length.
*/
- opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+ opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
/* large match -> immediate encoding */
{ U32 const maxML = matches[nbMatches-1].len;
- U32 const maxOffset = matches[nbMatches-1].off;
+ U32 const maxOffcode = matches[nbMatches-1].off;
DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
- nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
+ nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
if (maxML > sufficient_len) {
lastSequence.litlen = litlen;
lastSequence.mlen = maxML;
- lastSequence.off = maxOffset;
+ lastSequence.off = maxOffcode;
DEBUGLOG(6, "large match (%u>%u), immediate encoding",
maxML, sufficient_len);
cur = 0;
@@ -1005,24 +1114,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
} }
/* set prices for first matches starting position == 0 */
- { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ assert(opt[0].price >= 0);
+ { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
U32 pos;
U32 matchNb;
for (pos = 1; pos < minMatch; pos++) {
opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
}
for (matchNb = 0; matchNb < nbMatches; matchNb++) {
- U32 const offset = matches[matchNb].off;
+ U32 const offcode = matches[matchNb].off;
U32 const end = matches[matchNb].len;
for ( ; pos <= end ; pos++ ) {
- U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+ U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
U32 const sequencePrice = literalsPrice + matchPrice;
DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
pos, ZSTD_fCost(sequencePrice));
opt[pos].mlen = pos;
- opt[pos].off = offset;
+ opt[pos].off = offcode;
opt[pos].litlen = litlen;
- opt[pos].price = sequencePrice;
+ opt[pos].price = (int)sequencePrice;
} }
last_pos = pos-1;
}
@@ -1037,9 +1147,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
/* Fix current position with one literal if cheaper */
{ U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
int const price = opt[cur-1].price
- + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
- + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
- - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+ + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+ + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+ - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
assert(price < 1000000000); /* overflow check */
if (price <= opt[cur].price) {
DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
@@ -1065,7 +1175,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
assert(cur >= opt[cur].mlen);
if (opt[cur].mlen != 0) {
U32 const prev = cur - opt[cur].mlen;
- repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
+ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
} else {
ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
@@ -1082,11 +1192,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
}
+ assert(opt[cur].price >= 0);
{ U32 const ll0 = (opt[cur].mlen != 0);
U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
- U32 const previousPrice = opt[cur].price;
+ U32 const previousPrice = (U32)opt[cur].price;
U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
- U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
+ U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
U32 matchNb;
ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
@@ -1124,7 +1235,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
U32 const pos = cur + mlen;
- int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+ int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
if ((pos > last_pos) || (price < opt[pos].price)) {
DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
@@ -1154,7 +1265,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
* update them while traversing the sequences.
*/
if (lastSequence.mlen != 0) {
- repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
+ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
ZSTD_memcpy(rep, &reps, sizeof(reps));
} else {
ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
@@ -1198,7 +1309,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
assert(anchor + llen <= iend);
ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH);
+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
anchor += advance;
ip = anchor;
} }
@@ -1210,38 +1321,30 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
return (size_t)(iend - anchor);
}
+static size_t ZSTD_compressBlock_opt0(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
+}
+
+static size_t ZSTD_compressBlock_opt2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
+}
size_t ZSTD_compressBlock_btopt(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
DEBUGLOG(5, "ZSTD_compressBlock_btopt");
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
}
-/* used in 2-pass strategy */
-static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus)
-{
- U32 s, sum=0;
- assert(ZSTD_FREQ_DIV+bonus >= 0);
- for (s=0; s<lastEltIndex+1; s++) {
- table[s] <<= ZSTD_FREQ_DIV+bonus;
- table[s]--;
- sum += table[s];
- }
- return sum;
-}
-/* used in 2-pass strategy */
-MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
-{
- if (ZSTD_compressedLiterals(optPtr))
- optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
- optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0);
- optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0);
- optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0);
-}
/* ZSTD_initStats_ultra():
* make a first compression pass, just to seed stats with more accurate starting values.
@@ -1263,7 +1366,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */
- ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
+ ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/
/* invalidate first scan from history */
ZSTD_resetSeqStore(seqStore);
@@ -1272,8 +1375,6 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
ms->window.lowLimit = ms->window.dictLimit;
ms->nextToUpdate = ms->window.dictLimit;
- /* re-inforce weight of collected statistics */
- ZSTD_upscaleStats(&ms->opt);
}
size_t ZSTD_compressBlock_btultra(
@@ -1281,7 +1382,7 @@ size_t ZSTD_compressBlock_btultra(
const void* src, size_t srcSize)
{
DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
}
size_t ZSTD_compressBlock_btultra2(
@@ -1309,35 +1410,35 @@ size_t ZSTD_compressBlock_btultra2(
ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
}
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
}
size_t ZSTD_compressBlock_btopt_dictMatchState(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
}
size_t ZSTD_compressBlock_btultra_dictMatchState(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
}
size_t ZSTD_compressBlock_btopt_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
}
size_t ZSTD_compressBlock_btultra_extDict(
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
}
/* note : no btultra2 variant for extDict nor dictMatchState,
diff --git a/thirdparty/zstd/compress/zstdmt_compress.c b/thirdparty/zstd/compress/zstdmt_compress.c
index 22aa3e1245..6bc14b035e 100644
--- a/thirdparty/zstd/compress/zstdmt_compress.c
+++ b/thirdparty/zstd/compress/zstdmt_compress.c
@@ -102,9 +102,8 @@ typedef struct ZSTDMT_bufferPool_s {
buffer_t bTable[1]; /* variable size */
} ZSTDMT_bufferPool;
-static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned nbWorkers, ZSTD_customMem cMem)
+static ZSTDMT_bufferPool* ZSTDMT_createBufferPool(unsigned maxNbBuffers, ZSTD_customMem cMem)
{
- unsigned const maxNbBuffers = 2*nbWorkers + 3;
ZSTDMT_bufferPool* const bufPool = (ZSTDMT_bufferPool*)ZSTD_customCalloc(
sizeof(ZSTDMT_bufferPool) + (maxNbBuffers-1) * sizeof(buffer_t), cMem);
if (bufPool==NULL) return NULL;
@@ -160,9 +159,8 @@ static void ZSTDMT_setBufferSize(ZSTDMT_bufferPool* const bufPool, size_t const
}
-static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, U32 nbWorkers)
+static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool, unsigned maxNbBuffers)
{
- unsigned const maxNbBuffers = 2*nbWorkers + 3;
if (srcBufPool==NULL) return NULL;
if (srcBufPool->totalBuffers >= maxNbBuffers) /* good enough */
return srcBufPool;
@@ -171,7 +169,7 @@ static ZSTDMT_bufferPool* ZSTDMT_expandBufferPool(ZSTDMT_bufferPool* srcBufPool,
size_t const bSize = srcBufPool->bufferSize; /* forward parameters */
ZSTDMT_bufferPool* newBufPool;
ZSTDMT_freeBufferPool(srcBufPool);
- newBufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+ newBufPool = ZSTDMT_createBufferPool(maxNbBuffers, cMem);
if (newBufPool==NULL) return newBufPool;
ZSTDMT_setBufferSize(newBufPool, bSize);
return newBufPool;
@@ -263,6 +261,16 @@ static void ZSTDMT_releaseBuffer(ZSTDMT_bufferPool* bufPool, buffer_t buf)
ZSTD_customFree(buf.start, bufPool->cMem);
}
+/* We need 2 output buffers per worker since each dstBuff must be flushed after it is released.
+ * The 3 additional buffers are as follows:
+ * 1 buffer for input loading
+ * 1 buffer for "next input" when submitting current one
+ * 1 buffer stuck in queue */
+#define BUF_POOL_MAX_NB_BUFFERS(nbWorkers) 2*nbWorkers + 3
+
+/* After a worker releases its rawSeqStore, it is immediately ready for reuse.
+ * So we only need one seq buffer per worker. */
+#define SEQ_POOL_MAX_NB_BUFFERS(nbWorkers) nbWorkers
/* ===== Seq Pool Wrapper ====== */
@@ -316,7 +324,7 @@ static void ZSTDMT_setNbSeq(ZSTDMT_seqPool* const seqPool, size_t const nbSeq)
static ZSTDMT_seqPool* ZSTDMT_createSeqPool(unsigned nbWorkers, ZSTD_customMem cMem)
{
- ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+ ZSTDMT_seqPool* const seqPool = ZSTDMT_createBufferPool(SEQ_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
if (seqPool == NULL) return NULL;
ZSTDMT_setNbSeq(seqPool, 0);
return seqPool;
@@ -329,7 +337,7 @@ static void ZSTDMT_freeSeqPool(ZSTDMT_seqPool* seqPool)
static ZSTDMT_seqPool* ZSTDMT_expandSeqPool(ZSTDMT_seqPool* pool, U32 nbWorkers)
{
- return ZSTDMT_expandBufferPool(pool, nbWorkers);
+ return ZSTDMT_expandBufferPool(pool, SEQ_POOL_MAX_NB_BUFFERS(nbWorkers));
}
@@ -467,7 +475,7 @@ ZSTDMT_serialState_reset(serialState_t* serialState,
ZSTD_dictContentType_e dictContentType)
{
/* Adjust parameters */
- if (params.ldmParams.enableLdm) {
+ if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
DEBUGLOG(4, "LDM window size = %u KB", (1U << params.cParams.windowLog) >> 10);
ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
@@ -478,7 +486,7 @@ ZSTDMT_serialState_reset(serialState_t* serialState,
serialState->nextJobID = 0;
if (params.fParams.checksumFlag)
XXH64_reset(&serialState->xxhState, 0);
- if (params.ldmParams.enableLdm) {
+ if (params.ldmParams.enableLdm == ZSTD_ps_enable) {
ZSTD_customMem cMem = params.customMem;
unsigned const hashLog = params.ldmParams.hashLog;
size_t const hashSize = ((size_t)1 << hashLog) * sizeof(ldmEntry_t);
@@ -564,7 +572,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
/* A future job may error and skip our job */
if (serialState->nextJobID == jobID) {
/* It is now our turn, do any processing necessary */
- if (serialState->params.ldmParams.enableLdm) {
+ if (serialState->params.ldmParams.enableLdm == ZSTD_ps_enable) {
size_t error;
assert(seqStore.seq != NULL && seqStore.pos == 0 &&
seqStore.size == 0 && seqStore.capacity > 0);
@@ -594,7 +602,7 @@ static void ZSTDMT_serialState_update(serialState_t* serialState,
if (seqStore.size > 0) {
size_t const err = ZSTD_referenceExternalSequences(
jobCCtx, seqStore.seq, seqStore.size);
- assert(serialState->params.ldmParams.enableLdm);
+ assert(serialState->params.ldmParams.enableLdm == ZSTD_ps_enable);
assert(!ZSTD_isError(err));
(void)err;
}
@@ -672,7 +680,7 @@ static void ZSTDMT_compressionJob(void* jobDescription)
if (dstBuff.start==NULL) JOB_ERROR(ERROR(memory_allocation));
job->dstBuff = dstBuff; /* this value can be read in ZSTDMT_flush, when it copies the whole job */
}
- if (jobParams.ldmParams.enableLdm && rawSeqStore.seq == NULL)
+ if (jobParams.ldmParams.enableLdm == ZSTD_ps_enable && rawSeqStore.seq == NULL)
JOB_ERROR(ERROR(memory_allocation));
/* Don't compute the checksum for chunks, since we compute it externally,
@@ -680,7 +688,7 @@ static void ZSTDMT_compressionJob(void* jobDescription)
*/
if (job->jobID != 0) jobParams.fParams.checksumFlag = 0;
/* Don't run LDM for the chunks, since we handle it externally */
- jobParams.ldmParams.enableLdm = 0;
+ jobParams.ldmParams.enableLdm = ZSTD_ps_disable;
/* Correct nbWorkers to 0. */
jobParams.nbWorkers = 0;
@@ -807,6 +815,15 @@ typedef struct {
static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
#define RSYNC_LENGTH 32
+/* Don't create chunks smaller than the zstd block size.
+ * This stops us from regressing compression ratio too much,
+ * and ensures our output fits in ZSTD_compressBound().
+ *
+ * If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
+ * ZSTD_COMPRESSBOUND() will need to be updated.
+ */
+#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
+#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
typedef struct {
U64 hash;
@@ -927,7 +944,7 @@ MEM_STATIC ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced_internal(unsigned nbWorkers,
mtctx->jobs = ZSTDMT_createJobsTable(&nbJobs, cMem);
assert(nbJobs > 0); assert((nbJobs & (nbJobs - 1)) == 0); /* ensure nbJobs is a power of 2 */
mtctx->jobIDMask = nbJobs - 1;
- mtctx->bufPool = ZSTDMT_createBufferPool(nbWorkers, cMem);
+ mtctx->bufPool = ZSTDMT_createBufferPool(BUF_POOL_MAX_NB_BUFFERS(nbWorkers), cMem);
mtctx->cctxPool = ZSTDMT_createCCtxPool(nbWorkers, cMem);
mtctx->seqPool = ZSTDMT_createSeqPool(nbWorkers, cMem);
initError = ZSTDMT_serialState_init(&mtctx->serial);
@@ -1030,7 +1047,7 @@ static size_t ZSTDMT_resize(ZSTDMT_CCtx* mtctx, unsigned nbWorkers)
{
if (POOL_resize(mtctx->factory, nbWorkers)) return ERROR(memory_allocation);
FORWARD_IF_ERROR( ZSTDMT_expandJobsTable(mtctx, nbWorkers) , "");
- mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, nbWorkers);
+ mtctx->bufPool = ZSTDMT_expandBufferPool(mtctx->bufPool, BUF_POOL_MAX_NB_BUFFERS(nbWorkers));
if (mtctx->bufPool == NULL) return ERROR(memory_allocation);
mtctx->cctxPool = ZSTDMT_expandCCtxPool(mtctx->cctxPool, nbWorkers);
if (mtctx->cctxPool == NULL) return ERROR(memory_allocation);
@@ -1135,7 +1152,7 @@ size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx)
static unsigned ZSTDMT_computeTargetJobLog(const ZSTD_CCtx_params* params)
{
unsigned jobLog;
- if (params->ldmParams.enableLdm) {
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* In Long Range Mode, the windowLog is typically oversized.
* In which case, it's preferable to determine the jobSize
* based on cycleLog instead. */
@@ -1179,7 +1196,7 @@ static size_t ZSTDMT_computeOverlapSize(const ZSTD_CCtx_params* params)
int const overlapRLog = 9 - ZSTDMT_overlapLog(params->overlapLog, params->cParams.strategy);
int ovLog = (overlapRLog >= 8) ? 0 : (params->cParams.windowLog - overlapRLog);
assert(0 <= overlapRLog && overlapRLog <= 8);
- if (params->ldmParams.enableLdm) {
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
/* In Long Range Mode, the windowLog is typically oversized.
* In which case, it's preferable to determine the jobSize
* based on chainLog instead.
@@ -1252,6 +1269,9 @@ size_t ZSTDMT_initCStream_internal(
/* Aim for the targetsectionSize as the average job size. */
U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
+ /* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
+ * expected job size is at least 4x larger. */
+ assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
mtctx->rsync.hash = 0;
mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
@@ -1263,7 +1283,7 @@ size_t ZSTDMT_initCStream_internal(
ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
{
/* If ldm is enabled we need windowSize space. */
- size_t const windowSize = mtctx->params.ldmParams.enableLdm ? (1U << mtctx->params.cParams.windowLog) : 0;
+ size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
/* Two buffers of slack, plus extra space for the overlap
* This is the minimum slack that LDM works with. One extra because
* flush might waste up to targetSectionSize-1 bytes. Another extra
@@ -1538,17 +1558,21 @@ static range_t ZSTDMT_getInputDataInUse(ZSTDMT_CCtx* mtctx)
static int ZSTDMT_isOverlapped(buffer_t buffer, range_t range)
{
BYTE const* const bufferStart = (BYTE const*)buffer.start;
- BYTE const* const bufferEnd = bufferStart + buffer.capacity;
BYTE const* const rangeStart = (BYTE const*)range.start;
- BYTE const* const rangeEnd = range.size != 0 ? rangeStart + range.size : rangeStart;
if (rangeStart == NULL || bufferStart == NULL)
return 0;
- /* Empty ranges cannot overlap */
- if (bufferStart == bufferEnd || rangeStart == rangeEnd)
- return 0;
- return bufferStart < rangeEnd && rangeStart < bufferEnd;
+ {
+ BYTE const* const bufferEnd = bufferStart + buffer.capacity;
+ BYTE const* const rangeEnd = rangeStart + range.size;
+
+ /* Empty ranges cannot overlap */
+ if (bufferStart == bufferEnd || rangeStart == rangeEnd)
+ return 0;
+
+ return bufferStart < rangeEnd && rangeStart < bufferEnd;
+ }
}
static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
@@ -1575,7 +1599,7 @@ static int ZSTDMT_doesOverlapWindow(buffer_t buffer, ZSTD_window_t window)
static void ZSTDMT_waitForLdmComplete(ZSTDMT_CCtx* mtctx, buffer_t buffer)
{
- if (mtctx->params.ldmParams.enableLdm) {
+ if (mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable) {
ZSTD_pthread_mutex_t* mutex = &mtctx->serial.ldmWindowMutex;
DEBUGLOG(5, "ZSTDMT_waitForLdmComplete");
DEBUGLOG(5, "source [0x%zx, 0x%zx)",
@@ -1678,6 +1702,11 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
if (!mtctx->params.rsyncable)
/* Rsync is disabled. */
return syncPoint;
+ if (mtctx->inBuff.filled + input.size - input.pos < RSYNC_MIN_BLOCK_SIZE)
+ /* We don't emit synchronization points if it would produce too small blocks.
+ * We don't have enough input to find a synchronization point, so don't look.
+ */
+ return syncPoint;
if (mtctx->inBuff.filled + syncPoint.toLoad < RSYNC_LENGTH)
/* Not enough to compute the hash.
* We will miss any synchronization points in this RSYNC_LENGTH byte
@@ -1688,10 +1717,28 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
*/
return syncPoint;
/* Initialize the loop variables. */
- if (mtctx->inBuff.filled >= RSYNC_LENGTH) {
- /* We have enough bytes buffered to initialize the hash.
+ if (mtctx->inBuff.filled < RSYNC_MIN_BLOCK_SIZE) {
+ /* We don't need to scan the first RSYNC_MIN_BLOCK_SIZE positions
+ * because they can't possibly be a sync point. So we can start
+ * part way through the input buffer.
+ */
+ pos = RSYNC_MIN_BLOCK_SIZE - mtctx->inBuff.filled;
+ if (pos >= RSYNC_LENGTH) {
+ prev = istart + pos - RSYNC_LENGTH;
+ hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
+ } else {
+ assert(mtctx->inBuff.filled >= RSYNC_LENGTH);
+ prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
+ hash = ZSTD_rollingHash_compute(prev + pos, (RSYNC_LENGTH - pos));
+ hash = ZSTD_rollingHash_append(hash, istart, pos);
+ }
+ } else {
+ /* We have enough bytes buffered to initialize the hash,
+ * and are have processed enough bytes to find a sync point.
* Start scanning at the beginning of the input.
*/
+ assert(mtctx->inBuff.filled >= RSYNC_MIN_BLOCK_SIZE);
+ assert(RSYNC_MIN_BLOCK_SIZE >= RSYNC_LENGTH);
pos = 0;
prev = (BYTE const*)mtctx->inBuff.buffer.start + mtctx->inBuff.filled - RSYNC_LENGTH;
hash = ZSTD_rollingHash_compute(prev, RSYNC_LENGTH);
@@ -1705,16 +1752,6 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
syncPoint.flush = 1;
return syncPoint;
}
- } else {
- /* We don't have enough bytes buffered to initialize the hash, but
- * we know we have at least RSYNC_LENGTH bytes total.
- * Start scanning after the first RSYNC_LENGTH bytes less the bytes
- * already buffered.
- */
- pos = RSYNC_LENGTH - mtctx->inBuff.filled;
- prev = (BYTE const*)mtctx->inBuff.buffer.start - pos;
- hash = ZSTD_rollingHash_compute(mtctx->inBuff.buffer.start, mtctx->inBuff.filled);
- hash = ZSTD_rollingHash_append(hash, istart, pos);
}
/* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
* through the input. If we hit a synchronization point, then cut the
@@ -1726,8 +1763,9 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
*/
for (; pos < syncPoint.toLoad; ++pos) {
BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
- /* if (pos >= RSYNC_LENGTH) assert(ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash); */
+ assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
+ assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
if ((hash & hitMask) == hitMask) {
syncPoint.toLoad = pos + 1;
syncPoint.flush = 1;
diff --git a/thirdparty/zstd/compress/zstdmt_compress.h b/thirdparty/zstd/compress/zstdmt_compress.h
index 2fee2ec745..271eb1ac71 100644
--- a/thirdparty/zstd/compress/zstdmt_compress.h
+++ b/thirdparty/zstd/compress/zstdmt_compress.h
@@ -65,8 +65,11 @@ size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx);
* Private use only. Init streaming operation.
* expects params to be valid.
* must receive dict, or cdict, or none, but not both.
+ * mtctx can be freshly constructed or reused from a prior compression.
+ * If mtctx is reused, memory allocations from the prior compression may not be freed,
+ * even if they are not needed for the current compression.
* @return : 0, or an error code */
-size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* mtctx,
const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
const ZSTD_CDict* cdict,
ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);