diff options
Diffstat (limited to 'thirdparty/zstd/decompress')
-rw-r--r-- | thirdparty/zstd/decompress/huf_decompress.c | 916 | ||||
-rw-r--r-- | thirdparty/zstd/decompress/huf_decompress_amd64.S | 585 | ||||
-rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress.c | 107 | ||||
-rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_block.c | 964 | ||||
-rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_block.h | 10 | ||||
-rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_internal.h | 37 |
6 files changed, 2183 insertions, 436 deletions
diff --git a/thirdparty/zstd/decompress/huf_decompress.c b/thirdparty/zstd/decompress/huf_decompress.c index b93c9a003b..2027188255 100644 --- a/thirdparty/zstd/decompress/huf_decompress.c +++ b/thirdparty/zstd/decompress/huf_decompress.c @@ -22,6 +22,13 @@ #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 /* ************************************************************** * Macros @@ -36,6 +43,30 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif /* ************************************************************** * Error Management @@ -65,7 +96,7 @@ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ } \ \ - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( \ void* dst, size_t dstSize, \ const void* cSrc, size_t cSrcSize, \ const HUF_DTable* DTable) \ @@ -107,13 +138,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; + size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/** + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. + */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + + BYTE* const oend = (BYTE*)dst + dstSize; + + /* The following condition is false on x32 platform, + * but HUF_asm is not compatible with this ABI */ + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. + * If table log is not correct at this point, fallback to the old decoder. + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = srcSize - (length1 + length2 + length3 + 6); + args->iend[0] = istart + 6; /* jumpTable */ + args->iend[1] = args->iend[0] + length1; + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ + args->ip[0] = args->iend[1] - sizeof(U64); + args->ip[1] = args->iend[2] - sizeof(U64); + args->ip[2] = args->iend[3] - sizeof(U64); + args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] = (BYTE*)dst; + args->op[1] = args->op[0] + (dstSize+3)/4; + args->op[2] = args->op[1] + (dstSize+3)/4; + args->op[3] = args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] = HUF_initDStream(args->ip[0]); + args->bits[1] = HUF_initDStream(args->ip[1]); + args->bits[2] = HUF_initDStream(args->ip[2]); + args->bits[3] = HUF_initDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>= istart). + */ + args->ilimit = ilimit; + + args->oend = oend; + args->dt = dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ + bit->bitContainer = MEM_readLE64(args->ip[stream]); + bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; +} +#endif + #ifndef HUF_FORCE_DECOMPRESS_X2 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decoding */ /** * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at @@ -122,14 +287,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 = symbol + (nbBits << 8); - } else { D4 = (symbol << 8) + nbBits; + } else { + D4 = symbol + (nbBits << 8); } D4 *= 0x0001000100010001ULL; return D4; } +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale = targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by scale. */ + for (s = 0; s < nbSymbols; ++s) { + huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. + */ + for (s = targetTableLog; s > scale; --s) { + rankVal[s] = rankVal[s - scale]; + } + for (s = scale; s > 0; --s) { + rankVal[s] = 0; + } + } + return targetTableLog; +} + typedef struct { U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +357,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); if (HUF_isError(iSize)) return iSize; + /* Table header */ { DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog + 1; + U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); + tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType = 0; dtd.tableLog = (BYTE)tableLog; @@ -207,7 +406,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr /* fill DTable * We fill all entries of each weight in order. - * That way length is a constant for each iteration of the outter loop. + * That way length is a constant for each iteration of the outer loop. * We can switch based on the length to a different inner loop which is * optimized for that particular case. */ @@ -304,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons BYTE* const pStart = p; /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); } /* [0-3] symbols remaining */ @@ -388,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body( U32 endSignal = 1; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ - for ( ; (endSignal) & (op4 < olimit) ; ) { - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); - endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } } /* check corruption */ @@ -440,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body( } } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + /* Decompress and validate that we've produced exactly the expected length. */ + args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -447,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const HUF_DTable *DTable); HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} size_t HUF_decompress1X1_usingDTable( @@ -518,106 +817,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, /* *************************/ typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); + if (MEM_isLittleEndian()) { + seq = level == 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} -/* HUF_fillDTableX2Level2() : - * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, - U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize) +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level) { HUF_DEltX2 DElt; - U32* rankVal = wksp; + U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} - assert(wkspSize >= HUF_TABLELOG_MAX + 1); - (void)wkspSize; - /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ + U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} - /* fill skipped values */ - if (minWeight>1) { - U32 i, skipSize = rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits = (BYTE)(consumed); - DElt.length = 1; - for (i = 0; i < skipSize; i++) - DTable[i] = DElt; +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); + const sortedSymbol_t* ptr; + assert(level >= 1 && level <= 2); + switch (length) { + case 1: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + *DTableRank++ = DElt; + } + break; + case 2: + for (ptr = begin; ptr != end; ++ptr) { + HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); + DTableRank[0] = DElt; + DTableRank[1] = DElt; + DTableRank += 2; + } + break; + case 4: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank += 4; + } + break; + case 8: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank += 8; + } + break; + default: + for (ptr = begin; ptr != end; ++ptr) { + U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); + HUF_DEltX2* const DTableRankEnd = DTableRank + length; + for (; DTableRank != DTableRankEnd; DTableRank += 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; } +} - /* fill DTable */ - { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */ - const U32 symbol = sortedSymbols[s].symbol; - const U32 weight = sortedSymbols[s].weight; - const U32 nbBits = nbBitsBaseline - weight; - const U32 length = 1 << (sizeLog-nbBits); - const U32 start = rankVal[weight]; - U32 i = start; - const U32 end = start + length; - - MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); - DElt.nbBits = (BYTE)(nbBits + consumed); - DElt.length = 2; - do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */ +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, + const U32* rankVal, const int minWeight, const int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const* rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined weight + * is too large. + */ + if (minWeight>1) { + U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); + U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); + int const skipSize = rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize == 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <= 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i = 0; i < skipSize; i += 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } + } - rankVal[weight] += length; - } } + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w = minWeight; w < maxWeight1; ++w) { + int const begin = rankStart[w]; + int const end = rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + U32 const totalBits = nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sortedListSize, + const sortedSymbol_t* sortedList, const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline, U32* wksp, size_t wkspSize) + const U32 nbBitsBaseline) { - U32* rankVal = wksp; + U32* const rankVal = rankValOrigin[0]; const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ const U32 minBits = nbBitsBaseline - maxWeight; - U32 s; - - assert(wkspSize >= HUF_TABLELOG_MAX + 1); - wksp += HUF_TABLELOG_MAX + 1; - wkspSize -= HUF_TABLELOG_MAX + 1; - - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); - - /* fill DTable */ - for (s=0; s<sortedListSize; s++) { - const U16 symbol = sortedList[s].symbol; - const U32 weight = sortedList[s].weight; - const U32 nbBits = nbBitsBaseline - weight; - const U32 start = rankVal[weight]; - const U32 length = 1 << (targetLog-nbBits); - - if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */ - U32 sortedRank; + int w; + int const wEnd = (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w = 1; w < wEnd; ++w) { + int const begin = (int)rankStart[w]; + int const end = (int)rankStart[w+1]; + U32 const nbBits = nbBitsBaseline - w; + + if (targetLog-nbBits >= minBits) { + /* Enough room for a second symbol. */ + int start = rankVal[w]; + U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */); int minWeight = nbBits + scaleLog; + int s; if (minWeight < 1) minWeight = 1; - sortedRank = rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRank, - nbBitsBaseline, symbol, wksp, wkspSize); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s = begin; s != end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start += length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits = (BYTE)(nbBits); - DElt.length = 1; - { U32 const end = start + length; - U32 u; - for (u = start; u < end; u++) DTable[u] = DElt; - } } - rankVal[weight] += length; + /* Only a single symbol. */ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } typedef struct { rankValCol_t rankVal[HUF_TABLELOG_MAX]; U32 rankStats[HUF_TABLELOG_MAX + 1]; - U32 rankStart0[HUF_TABLELOG_MAX + 2]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; @@ -627,9 +1046,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; DTableDesc dtd = HUF_getDTableDesc(DTable); - U32 const maxTableLog = dtd.maxTableLog; + U32 maxTableLog = dtd.maxTableLog; size_t iSize; void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; @@ -647,11 +1073,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... */ - iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0); + iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); if (HUF_isError(iSize)) return iSize; /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG; /* find maxWeight */ for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ @@ -664,7 +1091,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, rankStart[w] = curr; } rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ - sizeOfSort = nextRankStart; + rankStart[maxW+1] = nextRankStart; } /* sort symbols by weight */ @@ -673,7 +1100,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, U32 const w = wksp->weightList[s]; U32 const r = rankStart[w]++; wksp->sortedSymbol[r].symbol = (BYTE)s; - wksp->sortedSymbol[r].weight = (BYTE)w; } rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ } @@ -698,10 +1124,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, } } } } HUF_fillDTableX2(dt, maxTableLog, - wksp->sortedSymbol, sizeOfSort, + wksp->sortedSymbol, wksp->rankStart0, wksp->rankVal, maxW, - tableLog+1, - wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); + tableLog+1); dtd.tableLog = (BYTE)maxTableLog; dtd.tableType = 1; @@ -714,7 +1139,7 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -723,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) { size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ - ZSTD_memcpy(op, dt+val, 1); - if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length==1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); - } } + } + } return 1; } @@ -753,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, BYTE* const pStart = p; /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { + if (dtLog <= 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); } /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >= 2) { + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); - while (p <= pEnd-2) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + } if (p < pEnd) p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -799,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } - FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -841,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body( U32 const dtLog = dtd.tableLog; if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); /* 16-32 symbols per loop (4-8 symbols per stream) */ - for ( ; (endSignal) & (op4 < olimit); ) { + if ((size_t)(oend - op4) >= sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; - endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; #else - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal = (U32)LIKELY( - (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) - & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); #endif + } } /* check corruption */ @@ -915,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body( } } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret != 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >= args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); + assert(args.ip[1] >= iend); + assert(args.ip[2] >= iend); + assert(args.ip[3] >= iend); + assert(args.op[3] <= oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <= (size_t)(oend - segmentEnd)) + segmentEnd += segmentSize; + else + segmentEnd = oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); + args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] != segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal) size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, @@ -1025,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] = { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ + {{0,0}, {1,1}}, /* Q==0 : impossible */ + {{0,0}, {1,1}}, /* Q==1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q == 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q == 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q == 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q == 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q == 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q == 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q == 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q == 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q ==10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q ==11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q ==12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q ==13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q ==14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q ==15 : 93-99% */ }; #endif @@ -1070,7 +1608,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) U32 const D256 = (U32)(dstSize >> 8); U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + DTime1 += DTime1 >> 5; /* small advantage to algorithm using less memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif diff --git a/thirdparty/zstd/decompress/huf_decompress_amd64.S b/thirdparty/zstd/decompress/huf_decompress_amd64.S new file mode 100644 index 0000000000..49589cb611 --- /dev/null +++ b/thirdparty/zstd/decompress/huf_decompress_amd64.S @@ -0,0 +1,585 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "../common/portability_macros.h" + +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +/* Calling convention: + * + * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + * No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + * + * TODO: Support Windows calling convention. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.text + +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. + */ + +#define op0 rsi +#define op1 rbx +#define op2 rcx +#define op3 rdi + +#define ip0 r8 +#define ip1 r9 +#define ip2 r10 +#define ip3 r11 + +#define bits0 rbp +#define bits1 rdx +#define bits2 r12 +#define bits3 r13 +#define dtable r14 +#define olimit r15 + +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0 r15 +#define var1 r9 +#define var2 r10 +#define var3 r11 + +/* 32-bit var registers */ +#define vard0 r15d +#define vard1 r9d +#define vard2 r10d +#define vard3 r11d + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. */ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + /* Read HUF_DecompressAsmArgs* args from %rax */ + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push 104(%rax) /* ilimit */ + push 112(%rax) /* oend */ + push %olimit /* olimit space */ + + subq $24, %rsp + +.L_4X1_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rbx, rdx must be saved + * op3 & ip0 mustn't be clobbered + */ + movq %rbx, 0(%rsp) + movq %rdx, 8(%rsp) + + movq 32(%rsp), %rax /* rax = oend */ + subq %op3, %rax /* rax = oend - op3 */ + + /* r15 = (oend - op3) / 5 */ + movabsq $-3689348814741910323, %rdx + mulq %rdx + movq %rdx, %r15 + shrq $2, %r15 + + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %rbx /* rbx = ip0 - ilimit */ + + /* rdx = (ip0 - ilimit) / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %rbx + shrq %rbx + addq %rbx, %rdx + shrq $2, %rdx + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* r15 = r15 * 5 */ + leaq (%r15, %r15, 4), %r15 + + /* olimit = op3 + r15 */ + addq %op3, %olimit + + movq 8(%rsp), %rdx + movq 0(%rsp), %rbx + + /* If (op3 + 20 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $20, %rax /* rax = op3 + 20 */ + cmpq %rax, %olimit /* op3 + 20 > olimit */ + jb .L_4X1_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X1_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X1_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X1_exit + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n) \ + movq $53, %var##n; \ + shrxq %var##n, %bits##n, %var##n; \ + movzwl (%dtable,%var##n,2),%vard##n + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx) \ + movq %var##n, %rax; \ + shlxq %var##n, %bits##n, %bits##n; \ + movb %ah, idx(%op##n) + +/* Assumes GET_NEXT_DELT has been called. + * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz = CTZ[bits[n]] + * nbBits = ctz & 7 + * nbBytes = ctz >> 3 + * op[n] += 5 + * ip[n] -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + andq $7, %rax; \ + shrq $3, %bits##n; \ + leaq 5(%op##n), %op##n; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlx %rax, %bits##n, %bits##n + + /* Store clobbered variables on the stack */ + movq %olimit, 24(%rsp) + movq %ip1, 0(%rsp) + movq %ip2, 8(%rsp) + movq %ip3, 16(%rsp) + + /* Call GET_NEXT_DELT for each stream */ + FOR_EACH_STREAM(GET_NEXT_DELT) + + .p2align 6 + +.L_4X1_loop_body: + /* Decode 5 symbols in each of the 4 streams (20 total) + * Must have called GET_NEXT_DELT for each stream + */ + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + + /* Load ip[1,2,3] from stack (var[] aliases them) + * ip[] is needed for RELOAD_BITS + * Each will be stored back to the stack after RELOAD + */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + RELOAD_BITS(0) + GET_NEXT_DELT(0) + + RELOAD_BITS(1) + movq %ip1, 0(%rsp) + GET_NEXT_DELT(1) + + RELOAD_BITS(2) + movq %ip2, 8(%rsp) + GET_NEXT_DELT(2) + + RELOAD_BITS(3) + movq %ip3, 16(%rsp) + GET_NEXT_DELT(3) + + /* If op3 < olimit: continue the loop */ + cmp %op3, 24(%rsp) + ja .L_4X1_loop_body + + /* Reload ip[1,2,3] from stack */ + movq 0(%rsp), %ip1 + movq 8(%rsp), %ip2 + movq 16(%rsp), %ip3 + + /* Re-compute olimit */ + jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: + addq $24, %rsp + + /* Restore stack (oend & olimit) */ + pop %rax /* olimit */ + pop %rax /* oend */ + pop %rax /* ilimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: + /* Save all registers - even if they are callee saved for simplicity. */ + push %rax + push %rbx + push %rcx + push %rdx + push %rbp + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + push %r12 + push %r13 + push %r14 + push %r15 + + movq %rdi, %rax + movq 0(%rax), %ip0 + movq 8(%rax), %ip1 + movq 16(%rax), %ip2 + movq 24(%rax), %ip3 + movq 32(%rax), %op0 + movq 40(%rax), %op1 + movq 48(%rax), %op2 + movq 56(%rax), %op3 + movq 64(%rax), %bits0 + movq 72(%rax), %bits1 + movq 80(%rax), %bits2 + movq 88(%rax), %bits3 + movq 96(%rax), %dtable + push %rax /* argument */ + push %rax /* olimit */ + push 104(%rax) /* ilimit */ + + movq 112(%rax), %rax + push %rax /* oend3 */ + + movq %op3, %rax + push %rax /* oend2 */ + + movq %op2, %rax + push %rax /* oend1 */ + + movq %op1, %rax + push %rax /* oend0 */ + + /* Scratch space */ + subq $8, %rsp + +.L_4X2_compute_olimit: + /* Computes how many iterations we can do safely + * %r15, %rax may be clobbered + * rdx must be saved + * op[1,2,3,4] & ip0 mustn't be clobbered + */ + movq %rdx, 0(%rsp) + + /* We can consume up to 7 input bytes each iteration. */ + movq %ip0, %rax /* rax = ip0 */ + movq 40(%rsp), %rdx /* rdx = ilimit */ + subq %rdx, %rax /* rax = ip0 - ilimit */ + movq %rax, %r15 /* r15 = ip0 - ilimit */ + + /* rdx = rax / 7 */ + movabsq $2635249153387078803, %rdx + mulq %rdx + subq %rdx, %r15 + shrq %r15 + addq %r15, %rdx + shrq $2, %rdx + + /* r15 = (ip0 - ilimit) / 7 */ + movq %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 8(%rsp), %rax /* rax = oend0 */ + subq %op0, %rax /* rax = oend0 - op0 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 16(%rsp), %rax /* rax = oend1 */ + subq %op1, %rax /* rax = oend1 - op1 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 24(%rsp), %rax /* rax = oend2 */ + subq %op2, %rax /* rax = oend2 - op2 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + movabsq $-3689348814741910323, %rdx + movq 32(%rsp), %rax /* rax = oend3 */ + subq %op3, %rax /* rax = oend3 - op3 */ + mulq %rdx + shrq $3, %rdx /* rdx = rax / 10 */ + + /* r15 = min(%rdx, %r15) */ + cmpq %rdx, %r15 + cmova %rdx, %r15 + + /* olimit = op3 + 5 * r15 */ + movq %r15, %rax + leaq (%op3, %rax, 4), %olimit + addq %rax, %olimit + + movq 0(%rsp), %rdx + + /* If (op3 + 10 > olimit) */ + movq %op3, %rax /* rax = op3 */ + addq $10, %rax /* rax = op3 + 10 */ + cmpq %rax, %olimit /* op3 + 10 > olimit */ + jb .L_4X2_exit + + /* If (ip1 < ip0) go to exit */ + cmpq %ip0, %ip1 + jb .L_4X2_exit + + /* If (ip2 < ip1) go to exit */ + cmpq %ip1, %ip2 + jb .L_4X2_exit + + /* If (ip3 < ip2) go to exit */ + cmpq %ip2, %ip3 + jb .L_4X2_exit + +#define DECODE(n, idx) \ + movq %bits##n, %rax; \ + shrq $53, %rax; \ + movzwl 0(%dtable,%rax,4),%r8d; \ + movzbl 2(%dtable,%rax,4),%r15d; \ + movzbl 3(%dtable,%rax,4),%eax; \ + movw %r8w, (%op##n); \ + shlxq %r15, %bits##n, %bits##n; \ + addq %rax, %op##n + +#define RELOAD_BITS(n) \ + bsfq %bits##n, %bits##n; \ + movq %bits##n, %rax; \ + shrq $3, %bits##n; \ + andq $7, %rax; \ + subq %bits##n, %ip##n; \ + movq (%ip##n), %bits##n; \ + orq $1, %bits##n; \ + shlxq %rax, %bits##n, %bits##n + + + movq %olimit, 48(%rsp) + + .p2align 6 + +.L_4X2_loop_body: + /* We clobber r8, so store it on the stack */ + movq %r8, 0(%rsp) + + /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ + FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) + FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + + /* Reload r8 */ + movq 0(%rsp), %r8 + + FOR_EACH_STREAM(RELOAD_BITS) + + cmp %op3, 48(%rsp) + ja .L_4X2_loop_body + jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: + addq $8, %rsp + /* Restore stack (oend & olimit) */ + pop %rax /* oend0 */ + pop %rax /* oend1 */ + pop %rax /* oend2 */ + pop %rax /* oend3 */ + pop %rax /* ilimit */ + pop %rax /* olimit */ + pop %rax /* arg */ + + /* Save ip / op / bits */ + movq %ip0, 0(%rax) + movq %ip1, 8(%rax) + movq %ip2, 16(%rax) + movq %ip3, 24(%rax) + movq %op0, 32(%rax) + movq %op1, 40(%rax) + movq %op2, 48(%rax) + movq %op3, 56(%rax) + movq %bits0, 64(%rax) + movq %bits1, 72(%rax) + movq %bits2, 80(%rax) + movq %bits3, 88(%rax) + + /* Restore registers */ + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rbp + pop %rdx + pop %rcx + pop %rbx + pop %rax + ret + +#endif diff --git a/thirdparty/zstd/decompress/zstd_decompress.c b/thirdparty/zstd/decompress/zstd_decompress.c index 910bc034c0..0031e98cfb 100644 --- a/thirdparty/zstd/decompress/zstd_decompress.c +++ b/thirdparty/zstd/decompress/zstd_decompress.c @@ -56,7 +56,6 @@ * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ -#include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -177,12 +176,15 @@ static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet, static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) { ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem); DEBUGLOG(4, "Allocating new hash set"); + if (!ret) + return NULL; ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); - ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; - ret->ddictPtrCount = 0; - if (!ret || !ret->ddictPtrTable) { + if (!ret->ddictPtrTable) { + ZSTD_customFree(ret, customMem); return NULL; } + ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; + ret->ddictPtrCount = 0; return ret; } @@ -255,11 +257,15 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->inBuffSize = 0; dctx->outBuffSize = 0; dctx->streamStage = zdss_init; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) dctx->legacyContext = NULL; dctx->previousLegacyVersion = 0; +#endif dctx->noForwardProgress = 0; dctx->oversizedDuration = 0; - dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif dctx->ddictSet = NULL; ZSTD_DCtx_resetParameters(dctx); #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION @@ -280,8 +286,7 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) return dctx; } -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); @@ -292,10 +297,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) } } +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + ZSTD_DCtx* ZSTD_createDCtx(void) { DEBUGLOG(3, "ZSTD_createDCtx"); - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -380,6 +390,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } + return 0; +} + /** ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -466,7 +489,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(dictIDSizeCode) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : break; case 1 : dictID = ip[pos]; pos++; break; case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -474,7 +499,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s } switch(fcsID) { - default: assert(0); /* impossible */ + default: + assert(0); /* impossible */ + ZSTD_FALLTHROUGH; case 0 : if (singleSegment) frameContentSize = ip[pos]; break; case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -503,7 +530,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); } - /** ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if known, otherwise @@ -544,6 +570,37 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) } } +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, + const void* src, size_t srcSize) +{ + U32 const magicNumber = MEM_readLE32(src); + size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); + size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst != NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); + if (magicVariant != NULL) + *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + /** ZSTD_findDecompressedSize() : * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed and/or @@ -858,7 +915,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, switch(blockProperties.blockType) { case bt_compressed: - decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1); + decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); break; case bt_raw : decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); @@ -1009,7 +1066,7 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) switch (dctx->dictUses) { default: assert(0 /* Impossible */); - /* fall-through */ + ZSTD_FALLTHROUGH; case ZSTD_dont_use: ZSTD_clearDict(dctx); return NULL; @@ -1031,7 +1088,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr { #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) size_t regenSize; - ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + ZSTD_DCtx* const dctx = ZSTD_createDCtx_internal(ZSTD_defaultCMem); RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); ZSTD_freeDCtx(dctx); @@ -1065,7 +1122,7 @@ static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t return dctx->expected; if (dctx->bType != bt_raw) return dctx->expected; - return MIN(MAX(inputSize, 1), dctx->expected); + return BOUNDED(1, inputSize, dctx->expected); } ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { @@ -1073,7 +1130,9 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { { default: /* should not happen */ assert(0); + ZSTD_FALLTHROUGH; case ZSTDds_getFrameHeaderSize: + ZSTD_FALLTHROUGH; case ZSTDds_decodeFrameHeader: return ZSTDnit_frameHeader; case ZSTDds_decodeBlockHeader: @@ -1085,6 +1144,7 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { case ZSTDds_checkChecksum: return ZSTDnit_checksum; case ZSTDds_decodeSkippableHeader: + ZSTD_FALLTHROUGH; case ZSTDds_skipFrame: return ZSTDnit_skippableFrame; } @@ -1168,7 +1228,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); dctx->expected = 0; /* Streaming not supported */ break; case bt_raw : @@ -1493,7 +1553,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, ZSTD_DStream* ZSTD_createDStream(void) { DEBUGLOG(3, "ZSTD_createDStream"); - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1503,7 +1563,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - return ZSTD_createDCtx_advanced(customMem); + return ZSTD_createDCtx_internal(customMem); } size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1763,7 +1823,8 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) { size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); - unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); + /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ + unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); size_t const minRBSize = (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, @@ -1897,10 +1958,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB DEBUGLOG(5, "stage zdss_init => transparent reset "); zds->streamStage = zdss_loadHeader; zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) zds->legacyVersion = 0; +#endif zds->hostageByte = 0; zds->expectedOutBuffer = *output; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_loadHeader : DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); @@ -2038,7 +2101,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB zds->outBuffSize = neededOutBuffSize; } } } zds->streamStage = zdss_read; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_read: DEBUGLOG(5, "stage zdss_read"); @@ -2057,7 +2120,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB } } if (ip==iend) { someMoreWork = 0; break; } /* no more input */ zds->streamStage = zdss_load; - /* fall-through */ + ZSTD_FALLTHROUGH; case zdss_load: { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.c b/thirdparty/zstd/decompress/zstd_decompress_block.c index 349dcdc333..2e44d30d2f 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_block.c +++ b/thirdparty/zstd/decompress/zstd_decompress_block.c @@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, } } +/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) +{ + if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) + { + /* room for litbuffer to fit without read faulting */ + dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; + } + else if (litSize > ZSTD_LITBUFFEREXTRASIZE) + { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { + /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; + } + else + { + /* fits entirely within litExtraBuffer, so no split is necessary */ + dctx->litBuffer = dctx->litExtraBuffer; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +} /* Hidden declaration for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize); + const void* src, size_t srcSize, + void* dst, size_t dstCapacity, const streaming_operation streaming); /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output. Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. + * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); @@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, case set_repeat: DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); - /* fall-through */ + ZSTD_FALLTHROUGH; case set_compressed: RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); @@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhlCode = (istart[0] >> 2) & 3; U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (singleStream) { hufSuccess = HUF_decompress1X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } else { hufSuccess = HUF_decompress4X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCSize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } } else { if (singleStream) { @@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); #endif } else { hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSize, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); } } + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; + } RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); @@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, dctx->litSize = litSize; dctx->litEntropy = 1; if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return litCSize + lhSize; } case set_basic: { size_t litSize, lhSize; U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); - ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr = istart+lhSize; dctx->litSize = litSize; + dctx->litBufferEnd = dctx->litPtr + litSize; + dctx->litBufferLocation = ZSTD_not_in_dst; return lhSize+litSize; } case set_rle: { U32 const lhlCode = ((istart[0]) >> 2) & 3; size_t litSize, lhSize; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ @@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); break; } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); - ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } dctx->litPtr = dctx->litBuffer; dctx->litSize = litSize; return lhSize+1; @@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = { }; /* ML_defaultDTable */ -static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits) +static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits) { void* ptr = dt; ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; @@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB cell->nbBits = 0; cell->nextState = 0; assert(nbAddBits < 255); - cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->nbAdditionalBits = nbAddBits; cell->baseValue = baseValue; } @@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB FORCE_INLINE_TEMPLATE void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode = dt+1; @@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) ); tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); - tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol]; + tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol]; tableDecode[u].baseValue = baseValue[symbol]; } } @@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, /* Avoids the FORCE_INLINE of the _body() function. */ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, } #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, +BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) { #if DYNAMIC_BMI2 @@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr, symbolEncodingType_e type, unsigned max, U32 maxLog, const void* src, size_t srcSize, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable, int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize, int bmi2) @@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""); { U32 const symbol = *(const BYTE*)src; U32 const baseline = baseValue[symbol]; - U32 const nbBits = nbAdditionalBits[symbol]; + U8 const nbBits = nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr = DTableSpace; @@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, LL_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += llhSize; } @@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, OF_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += ofhSize; } @@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, ML_defaultDTable, dctx->fseEntropy, dctx->ddictIsCold, nbSeq, dctx->workspace, sizeof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(dctx)); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); ip += mlhSize; } @@ -713,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. * The src buffer must be before the dst buffer. */ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff = op - ip; BYTE* const oend = op + length; @@ -729,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ assert(length >= 8); ZSTD_overlapCopy8(&op, &ip, diff); + length -= 8; assert(op - ip >= 8); assert(op <= oend); } @@ -743,12 +820,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip += oend_w - op; - op = oend_w; + op += oend_w - op; } /* Handle the leftovers. */ while (op < oend) *op++ = *ip++; } +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ = *ip++; + return; + } + + if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); + ip += oend - WILDCOPY_OVERLENGTH - op; + op += oend - WILDCOPY_OVERLENGTH - op; + } + + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + /* ZSTD_execSequenceEnd(): * This version handles cases that are near the end of the output buffer. It requires * more careful checks to make sure there is no overflow. By separating out these hard @@ -759,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_ */ FORCE_NOINLINE size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -784,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); - match = dictEnd - (prefixStart-match); + match = dictEnd - (prefixStart - match); if (match + sequence.matchLength <= dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 = dictEnd - match; - ZSTD_memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = prefixStart; - } } + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. + */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart - match); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); return sequenceLength; } HINT_INLINE size_t ZSTD_execSequence(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) { BYTE* const oLitEnd = op + sequence.litLength; size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -821,10 +970,102 @@ size_t ZSTD_execSequence(BYTE* op, * - 32-bit mode and the match length overflows */ if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. + */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) - return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <= oLitEnd /* No overflow */); @@ -892,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } + static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) { @@ -905,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS } FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) -{ - ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; - U32 const nbBits = DInfo.nbBits; - size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits) { - U32 const nbBits = DInfo.nbBits; size_t const lowBits = BIT_readBits(bitD, nbBits); - DStatePtr->state = DInfo.nextState + lowBits; + DStatePtr->state = nextState + lowBits; } /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum @@ -937,102 +1169,100 @@ FORCE_INLINE_TEMPLATE seq_t ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) { seq_t seq; - ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; - ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; - ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; - U32 const llBase = llDInfo.baseValue; - U32 const mlBase = mlDInfo.baseValue; - U32 const ofBase = ofDInfo.baseValue; - BYTE const llBits = llDInfo.nbAdditionalBits; - BYTE const mlBits = mlDInfo.nbAdditionalBits; - BYTE const ofBits = ofDInfo.nbAdditionalBits; - BYTE const totalBits = llBits+mlBits+ofBits; - - /* sequence */ - { size_t offset; - if (ofBits > 1) { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); - assert(ofBits <= MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { - U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); - assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } else { - U32 const ll0 = (llBase == 0); - if (LIKELY((ofBits == 0))) { - if (LIKELY(!ll0)) - offset = seqState->prevOffset[0]; - else { - offset = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; + BYTE const llBits = llDInfo->nbAdditionalBits; + BYTE const mlBits = mlDInfo->nbAdditionalBits; + BYTE const ofBits = ofDInfo->nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + U16 const llNext = llDInfo->nextState; + U16 const mlNext = mlDInfo->nextState; + U16 const ofNext = ofDInfo->nextState; + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is only + * valuable to mark likelyness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; } else { - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); - { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } } } - seq.offset = offset; - } - - seq.matchLength = mlBase; - if (mlBits > 0) - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - - seq.litLength = llBase; - if (llBits > 0) - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). - * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if defined(__GNUC__) && !defined(__clang__) - const int kUseUpdateFseState = 1; -#else - const int kUseUpdateFseState = 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ + U32 const ll0 = (llDInfo->baseValue == 0); + if (LIKELY((ofBits == 0))) { + offset = seqState->prevOffset[ll0]; + seqState->prevOffset[1] = seqState->prevOffset[!ll0]; + seqState->prevOffset[0] = offset; + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ } return seq; @@ -1085,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence( #endif #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + + FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, +ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset, @@ -1099,11 +1331,11 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BYTE* const oend = ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); (void)frame; /* Regen sequences */ @@ -1124,55 +1356,237 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); + /* decompress without overrunning litPtr begins */ + { + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompression loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on bad cases and influence on performance + * from changes external to this code, rather than to overoptimize on the current commit. + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ #if defined(__GNUC__) && defined(__x86_64__) - /* Align the decompression loop to 32 + 16 bytes. - * - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression - * speed swings based on the alignment of the decompression loop. This - * performance swing is caused by parts of the decompression loop falling - * out of the DSB. The entire decompression loop should fit in the DSB, - * when it can't we get much worse performance. You can measure if you've - * hit the good case or the bad case with this perf command for some - * compressed file test.zst: - * - * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ - * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst - * - * If you see most cycles served out of the MITE you've hit the bad case. - * If you see most cycles served out of the DSB you've hit the good case. - * If it is pretty even then you may be in an okay case. - * - * This issue has been reproduced on the following CPUs: - * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 - * Use Instruments->Counters to get DSB/MITE cycles. - * I never got performance swings, but I was able to - * go from the good case of mostly DSB to half of the - * cycles served from MITE. - * - Coffeelake: Intel i9-9900k - * - Coffeelake: Intel i7-9700k - * - * I haven't been able to reproduce the instability or DSB misses on any - * of the following CPUS: - * - Haswell - * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH - * - Skylake - * - * If you are seeing performance stability this script can help test. - * It tests on 4 commits in zstd where I saw performance change. - * - * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 - */ - __asm__(".p2align 6"); - __asm__("nop"); - __asm__(".p2align 5"); - __asm__("nop"); -# if __GNUC__ >= 9 - /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */ - __asm__(".p2align 3"); + __asm__(".p2align 6"); +# if __GNUC__ >= 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ == 8 || __GNUC__ == 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ + for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { + size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + } + + /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (--nbSeq) + BIT_reloadDStream(&(seqState.DStream)); + } + } + } + + if (nbSeq > 0) /* there is remaining lit from extra buffer */ + { + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ != 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >= 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for (; ; ) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ + { + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } + { size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__GNUC__) && defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >= 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); # else - __asm__(".p2align 4"); + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); # endif #endif + for ( ; ; ) { seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); @@ -1218,6 +1632,16 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT @@ -1250,10 +1674,10 @@ ZSTD_decompressSequencesLong_body( const BYTE* ip = (const BYTE*)seqStart; const BYTE* const iend = ip + seqSize; BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + maxDstSize; + BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; BYTE* op = ostart; const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* litBufferEnd = dctx->litBufferEnd; const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); @@ -1289,32 +1713,94 @@ ZSTD_decompressSequencesLong_body( } RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); - /* decode and decompress */ - for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { - seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); + /* decompress without stomping litBuffer */ + for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { + seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); + size_t oneSeqSize; + + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) + { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); - sequences[seqNb & STORED_SEQS_MASK] = sequence; - op += oneSeqSize; + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ + oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] = sequence; + op += oneSeqSize; + } } RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, ""); /* finish queue */ seqNb -= seqAdvance; for ( ; seqNb<nbSeq ; seqNb++) { - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); + seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); + if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) + { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + { + size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op += oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } + } + else + { + size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } } /* save reps for next block */ @@ -1322,10 +1808,21 @@ ZSTD_decompressSequencesLong_body( } /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; + if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ + { + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op += lastLLSize; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize = litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); if (op != NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); + ZSTD_memmove(op, litPtr, lastLLSize); op += lastLLSize; } } @@ -1349,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, #if DYNAMIC_BMI2 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1359,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -1391,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ @@ -1415,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif @@ -1456,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable) size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame) + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) { /* blockType == blockCompressed */ const BYTE* ip = (const BYTE*)src; /* isLongOffset must be true if there are long offsets. @@ -1471,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); /* Decode literals section */ - { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); if (ZSTD_isError(litCSize)) return litCSize; ip += litCSize; @@ -1519,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + if (dctx->litBufferLocation == ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); #endif } } @@ -1542,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, { size_t dSize; ZSTD_checkContinuity(dctx, dst, dstCapacity); - dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); dctx->previousDstEnd = (char*)dst + dSize; return dSize; } diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.h b/thirdparty/zstd/decompress/zstd_decompress_block.h index 049a0cd84c..c61a9d0c4b 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_block.h +++ b/thirdparty/zstd/decompress/zstd_decompress_block.h @@ -33,6 +33,12 @@ */ + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming = 0, + is_streaming = 1 +} streaming_operation; + /* ZSTD_decompressBlock_internal() : * decompress block, starting at `src`, * into destination buffer `dst`. @@ -41,7 +47,7 @@ */ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame); + const void* src, size_t srcSize, const int frame, const streaming_operation streaming); /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) @@ -54,7 +60,7 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, */ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2); diff --git a/thirdparty/zstd/decompress/zstd_decompress_internal.h b/thirdparty/zstd/decompress/zstd_decompress_internal.h index ebda0c9031..2b5a53850a 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_internal.h +++ b/thirdparty/zstd/decompress/zstd_decompress_internal.h @@ -20,7 +20,7 @@ * Dependencies *********************************************************/ #include "../common/mem.h" /* BYTE, U16, U32 */ -#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */ +#include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. */ @@ -40,7 +40,7 @@ static UNUSED_ATTR const U32 OF_base[MaxOff+1] = { 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; -static UNUSED_ATTR const U32 OF_bits[MaxOff+1] = { +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -106,6 +106,22 @@ typedef struct { size_t ddictPtrCount; } ZSTD_DDictHashSet; +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuffer */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst = 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst = 1, /* Stored entirely within dst (in memory after current output write) */ + ZSTD_split = 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -136,7 +152,9 @@ struct ZSTD_DCtx_s size_t litSize; size_t rleSize; size_t staticSize; +#if DYNAMIC_BMI2 != 0 int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ +#endif /* dictionary */ ZSTD_DDict* ddictLocal; @@ -158,16 +176,21 @@ struct ZSTD_DCtx_s size_t outStart; size_t outEnd; size_t lhSize; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) void* legacyContext; U32 previousLegacyVersion; U32 legacyVersion; +#endif U32 hostageByte; int noForwardProgress; ZSTD_bufferMode_e outBufferMode; ZSTD_outBuffer expectedOutBuffer; /* workspace */ - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; size_t oversizedDuration; @@ -183,6 +206,14 @@ struct ZSTD_DCtx_s #endif }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 != 0 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} /*-******************************************************* * Shared internal functions |