Diffstat (limited to 'thirdparty/zstd/decompress')
| -rw-r--r-- | thirdparty/zstd/decompress/huf_decompress.c           | 916 |
| -rw-r--r-- | thirdparty/zstd/decompress/huf_decompress_amd64.S     | 585 |
| -rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress.c          | 107 |
| -rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_block.c    | 964 |
| -rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_block.h    |  10 |
| -rw-r--r-- | thirdparty/zstd/decompress/zstd_decompress_internal.h |  37 |
6 files changed, 2183 insertions, 436 deletions
diff --git a/thirdparty/zstd/decompress/huf_decompress.c b/thirdparty/zstd/decompress/huf_decompress.c index b93c9a003b..2027188255 100644 --- a/thirdparty/zstd/decompress/huf_decompress.c +++ b/thirdparty/zstd/decompress/huf_decompress.c @@ -22,6 +22,13 @@  #define HUF_STATIC_LINKING_ONLY  #include "../common/huf.h"  #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +*  Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11  /* **************************************************************  *  Macros @@ -36,6 +43,30 @@  #error "Cannot force the use of the X1 and X2 decoders at the same time!"  #endif +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#ifdef __cplusplus +# define HUF_EXTERN_C extern "C" +#else +# define HUF_EXTERN_C +#endif +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif  /* **************************************************************  *  Error Management @@ -65,7 +96,7 @@          return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \      }                                                                       \                                                                              \ -    static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2(                       \ +    static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2(                          \                    void* dst,  size_t dstSize,                               \              const void* cSrc, size_t cSrcSize,                              \              const HUF_DTable* DTable)                                       \ @@ -107,13 +138,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)      return dtd;  } +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { +    BYTE const lastByte = ip[7]; +    size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; +    size_t const value = MEM_readLEST(ip) | 1; +    assert(bitsConsumed <= 8); +    return value << bitsConsumed; +} +typedef struct { +    BYTE const* ip[4]; +    BYTE* op[4]; +    U64 bits[4]; +    void const* dt; +    BYTE const* ilimit; +    BYTE* oend; +    BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/** + * Initializes args for the asm decoding loop. + * @returns 0 on success + *          1 if the fallback implementation should be used. + *          Or an error code on failure. 
+ */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +{ +    void const* dt = DTable + 1; +    U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +    const BYTE* const ilimit = (const BYTE*)src + 6 + 8; + +    BYTE* const oend = (BYTE*)dst + dstSize; + +    /* The following condition is false on x32 platform, +     * but HUF_asm is not compatible with this ABI */ +    if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + +    /* strict minimum : jump table + 1 byte per stream */ +    if (srcSize < 10) +        return ERROR(corruption_detected); + +    /* Must have at least 8 bytes per stream because we don't handle initializing smaller bit containers. +     * If table log is not correct at this point, fallback to the old decoder. +     * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. +     */ +    if (dtLog != HUF_DECODER_FAST_TABLELOG) +        return 1; + +    /* Read the jump table. */ +    { +        const BYTE* const istart = (const BYTE*)src; +        size_t const length1 = MEM_readLE16(istart); +        size_t const length2 = MEM_readLE16(istart+2); +        size_t const length3 = MEM_readLE16(istart+4); +        size_t const length4 = srcSize - (length1 + length2 + length3 + 6); +        args->iend[0] = istart + 6;  /* jumpTable */ +        args->iend[1] = args->iend[0] + length1; +        args->iend[2] = args->iend[1] + length2; +        args->iend[3] = args->iend[2] + length3; + +        /* HUF_initDStream() requires this, and this small of an input +         * won't benefit from the ASM loop anyways. +         * length1 must be >= 16 so that ip[0] >= ilimit before the loop +         * starts. +         */ +        if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +            return 1; +        if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */ +    } +    /* ip[] contains the position that is currently loaded into bits[]. */ +    args->ip[0] = args->iend[1] - sizeof(U64); +    args->ip[1] = args->iend[2] - sizeof(U64); +    args->ip[2] = args->iend[3] - sizeof(U64); +    args->ip[3] = (BYTE const*)src + srcSize - sizeof(U64); + +    /* op[] contains the output pointers. */ +    args->op[0] = (BYTE*)dst; +    args->op[1] = args->op[0] + (dstSize+3)/4; +    args->op[2] = args->op[1] + (dstSize+3)/4; +    args->op[3] = args->op[2] + (dstSize+3)/4; + +    /* No point to call the ASM loop for tiny outputs. */ +    if (args->op[3] >= oend) +        return 1; + +    /* bits[] is the bit container. +        * It is read from the MSB down to the LSB. +        * It is shifted left as it is read, and zeros are +        * shifted in. After the lowest valid bit a 1 is +        * set, so that CountTrailingZeros(bits[]) can be used +        * to count how many bits we've consumed. +        */ +    args->bits[0] = HUF_initDStream(args->ip[0]); +    args->bits[1] = HUF_initDStream(args->ip[1]); +    args->bits[2] = HUF_initDStream(args->ip[2]); +    args->bits[3] = HUF_initDStream(args->ip[3]); + +    /* If ip[] >= ilimit, it is guaranteed to be safe to +        * reload bits[]. It may be beyond its section, but is +        * guaranteed to be valid (>= istart). 
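
For reference, the 4-stream header that HUF_DecompressAsmArgs_init() parses here is zstd's standard 4-stream layout: a 6-byte jump table holding three little-endian 16-bit stream lengths, with the fourth stream taking whatever remains. A minimal sketch with hypothetical helper names (the real code additionally rejects streams shorter than 16/8/8/8 bytes before entering the asm loop):

    #include <stddef.h>
    #include <stdint.h>

    static uint16_t read_le16(const uint8_t* p) { return (uint16_t)(p[0] | (p[1] << 8)); }

    /* Splits src into 4 streams; iend[n] is one past the end of stream n. */
    static int split_streams(const uint8_t* src, size_t srcSize, const uint8_t* iend[4])
    {
        size_t const l1 = read_le16(src), l2 = read_le16(src + 2), l3 = read_le16(src + 4);
        if (srcSize < 6 + l1 + l2 + l3) return -1;   /* lengths overflow the input */
        iend[0] = src + 6;        /* end of the jump table / start of stream 1 */
        iend[1] = iend[0] + l1;
        iend[2] = iend[1] + l2;
        iend[3] = iend[2] + l3;   /* stream 4 then runs to src + srcSize */
        return 0;
    }
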
+        */ +    args->ilimit = ilimit; + +    args->oend = oend; +    args->dt = dt; + +    return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +{ +    /* Validate that we haven't overwritten. */ +    if (args->op[stream] > segmentEnd) +        return ERROR(corruption_detected); +    /* Validate that we haven't read beyond iend[]. +        * Note that ip[] may be < iend[] because the MSB is +        * the next bit to read, and we may have consumed 100% +        * of the stream, so down to iend[i] - 8 is valid. +        */ +    if (args->ip[stream] < args->iend[stream] - 8) +        return ERROR(corruption_detected); + +    /* Construct the BIT_DStream_t. */ +    bit->bitContainer = MEM_readLE64(args->ip[stream]); +    bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +    bit->start = (const char*)args->iend[0]; +    bit->limitPtr = bit->start + sizeof(size_t); +    bit->ptr = (const char*)args->ip[stream]; + +    return 0; +} +#endif +  #ifndef HUF_FORCE_DECOMPRESS_X2  /*-***************************/  /*  single-symbol decoding   */  /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1;   /* single-symbol decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decoding */  /**   * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entries at @@ -122,14 +287,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1;   /* single-symbol decodi  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {      U64 D4;      if (MEM_isLittleEndian()) { -        D4 = symbol + (nbBits << 8); -    } else {          D4 = (symbol << 8) + nbBits; +    } else { +        D4 = symbol + (nbBits << 8);      }      D4 *= 0x0001000100010001ULL;      return D4;  } +/** + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols, U32 tableLog, U32 targetTableLog) +{ +    if (tableLog > targetTableLog) +        return tableLog; +    if (tableLog < targetTableLog) { +        U32 const scale = targetTableLog - tableLog; +        U32 s; +        /* Increase the weight for all non-zero probability symbols by scale. */ +        for (s = 0; s < nbSymbols; ++s) { +            huffWeight[s] += (BYTE)((huffWeight[s] == 0) ? 0 : scale); +        } +        /* Update rankVal to reflect the new weights. +         * All weights except 0 get moved to weight + scale. +         * Weights [1, scale] are empty. 
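
The '| 1' sentinel above is what makes HUF_initRemainingDStream() work: consuming n bits is modeled as shifting the container left by n, which parks the sentinel at bit n, so a trailing-zero count recovers exactly how many bits were consumed. A self-contained model, using __builtin_ctzll (gcc/clang) as a stand-in for ZSTD_countTrailingZeros:

    #include <assert.h>
    #include <stdint.h>

    static unsigned consumed_bits(uint64_t bits) { return (unsigned)__builtin_ctzll(bits); }

    int main(void)
    {
        uint64_t bits = 0xABCDEF0123456789ULL | 1;  /* freshly initialized container */
        assert(consumed_bits(bits) == 0);
        bits <<= 11;                    /* one 11-bit table lookup consumed */
        assert(consumed_bits(bits) == 11);
        bits <<= 5;                     /* plus a 5-bit code */
        assert(consumed_bits(bits) == 16);
        return 0;
    }
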
+         */ +        for (s = targetTableLog; s > scale; --s) { +            rankVal[s] = rankVal[s - scale]; +        } +        for (s = scale; s > 0; --s) { +            rankVal[s] = 0; +        } +    } +    return targetTableLog; +} +  typedef struct {          U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];          U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +357,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr      iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);      if (HUF_isError(iSize)) return iSize; +      /* Table header */      {   DTableDesc dtd = HUF_getDTableDesc(DTable); +        U32 const maxTableLog = dtd.maxTableLog + 1; +        U32 const targetTableLog = MIN(maxTableLog, HUF_DECODER_FAST_TABLELOG); +        tableLog = HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbSymbols, tableLog, targetTableLog);          if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge);   /* DTable too small, Huffman tree cannot fit in */          dtd.tableType = 0;          dtd.tableLog = (BYTE)tableLog; @@ -207,7 +406,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr      /* fill DTable       * We fill all entries of each weight in order. -     * That way length is a constant for each iteration of the outter loop. +     * That way length is a constant for each iteration of the outer loop.       * We can switch based on the length to a different inner loop which is       * optimized for that particular case.       */ @@ -304,11 +503,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons      BYTE* const pStart = p;      /* up to 4 symbols at a time */ -    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { -        HUF_DECODE_SYMBOLX1_2(p, bitDPtr); -        HUF_DECODE_SYMBOLX1_1(p, bitDPtr); -        HUF_DECODE_SYMBOLX1_2(p, bitDPtr); -        HUF_DECODE_SYMBOLX1_0(p, bitDPtr); +    if ((pEnd - p) > 3) { +        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { +            HUF_DECODE_SYMBOLX1_2(p, bitDPtr); +            HUF_DECODE_SYMBOLX1_1(p, bitDPtr); +            HUF_DECODE_SYMBOLX1_2(p, bitDPtr); +            HUF_DECODE_SYMBOLX1_0(p, bitDPtr); +        } +    } else { +        BIT_reloadDStream(bitDPtr);      }      /* [0-3] symbols remaining */ @@ -388,33 +591,36 @@ HUF_decompress4X1_usingDTable_internal_body(          U32 endSignal = 1;          if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */ +        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );          CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );          /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ -        for ( ; (endSignal) & (op4 < olimit) ; ) { -            HUF_DECODE_SYMBOLX1_2(op1, &bitD1); -            HUF_DECODE_SYMBOLX1_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX1_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX1_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX1_1(op1, &bitD1); -            HUF_DECODE_SYMBOLX1_1(op2, &bitD2); -            HUF_DECODE_SYMBOLX1_1(op3, &bitD3); -            HUF_DECODE_SYMBOLX1_1(op4, &bitD4); -   
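
A worked example of HUF_rescaleStats() above: a table built with tableLog 6, rescaled to the fast decoder's HUF_DECODER_FAST_TABLELOG of 11, bumps every live weight by scale = 5, and the rankVal counters move up by the same amount (weights 1..5 become empty). A toy version of just the weight pass:

    #include <assert.h>

    int main(void)
    {
        unsigned char w[3] = { 6, 1, 0 };   /* two live symbols + one zero-weight */
        unsigned const scale = 11 - 6;      /* targetTableLog - tableLog */
        int s;
        for (s = 0; s < 3; ++s)
            w[s] = (unsigned char)(w[s] + (w[s] == 0 ? 0 : scale));
        assert(w[0] == 11 && w[1] == 6 && w[2] == 0);  /* zero weights stay zero */
        return 0;
    }
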
         HUF_DECODE_SYMBOLX1_2(op1, &bitD1); -            HUF_DECODE_SYMBOLX1_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX1_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX1_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX1_0(op1, &bitD1); -            HUF_DECODE_SYMBOLX1_0(op2, &bitD2); -            HUF_DECODE_SYMBOLX1_0(op3, &bitD3); -            HUF_DECODE_SYMBOLX1_0(op4, &bitD4); -            endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; -            endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; -            endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; -            endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +        if ((size_t)(oend - op4) >= sizeof(size_t)) { +            for ( ; (endSignal) & (op4 < olimit) ; ) { +                HUF_DECODE_SYMBOLX1_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX1_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX1_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX1_2(op4, &bitD4); +                HUF_DECODE_SYMBOLX1_1(op1, &bitD1); +                HUF_DECODE_SYMBOLX1_1(op2, &bitD2); +                HUF_DECODE_SYMBOLX1_1(op3, &bitD3); +                HUF_DECODE_SYMBOLX1_1(op4, &bitD4); +                HUF_DECODE_SYMBOLX1_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX1_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX1_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX1_2(op4, &bitD4); +                HUF_DECODE_SYMBOLX1_0(op1, &bitD1); +                HUF_DECODE_SYMBOLX1_0(op2, &bitD2); +                HUF_DECODE_SYMBOLX1_0(op3, &bitD3); +                HUF_DECODE_SYMBOLX1_0(op4, &bitD4); +                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; +                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; +                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; +                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +            }          }          /* check corruption */ @@ -440,6 +646,79 @@ HUF_decompress4X1_usingDTable_internal_body(      }  } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable) { +    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable) { +    return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( +          void* dst,  size_t dstSize, +    const void* cSrc, size_t cSrcSize, +    const HUF_DTable* DTable) +{ +    void const* dt = DTable + 1; +    const BYTE* const iend = (const BYTE*)cSrc + 6; +    BYTE* const oend = (BYTE*)dst + dstSize; +    HUF_DecompressAsmArgs args; +    { +        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +        FORWARD_IF_ERROR(ret, "Failed to init asm args"); +        if (ret != 0) +            return 
HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +    } + +    assert(args.ip[0] >= args.ilimit); +    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + +    /* Our loop guarantees that ip[] >= ilimit and that we haven't +    * overwritten any op[]. +    */ +    assert(args.ip[0] >= iend); +    assert(args.ip[1] >= iend); +    assert(args.ip[2] >= iend); +    assert(args.ip[3] >= iend); +    assert(args.op[3] <= oend); +    (void)iend; + +    /* finish bit streams one by one. */ +    { +        size_t const segmentSize = (dstSize+3) / 4; +        BYTE* segmentEnd = (BYTE*)dst; +        int i; +        for (i = 0; i < 4; ++i) { +            BIT_DStream_t bit; +            if (segmentSize <= (size_t)(oend - segmentEnd)) +                segmentEnd += segmentSize; +            else +                segmentEnd = oend; +            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); +            /* Decompress and validate that we've produced exactly the expected length. */ +            args.op[i] += HUF_decodeStreamX1(args.op[i], &bit, segmentEnd, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); +            if (args.op[i] != segmentEnd) return ERROR(corruption_detected); +        } +    } + +    /* decoded size */ +    return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */  typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,                                                 const void *cSrc, @@ -447,8 +726,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,                                                 const HUF_DTable *DTable);  HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 +    if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 +        return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else +        return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif +    } +#else +    (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +    return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else +    return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +}  size_t HUF_decompress1X1_usingDTable( @@ -518,106 +817,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,  /* *************************/  typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2;  /* double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t;  typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];  typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; +/** + * Constructs a HUF_DEltX2 in a U32. + */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int level) +{ +    U32 seq; +    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) == 0); +    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) == 2); +    DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) == 3); +    DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(U32)); +    if (MEM_isLittleEndian()) { +        seq = level == 1 ? 
symbol : (baseSeq + (symbol << 8)); +        return seq + (nbBits << 16) + ((U32)level << 24); +    } else { +        seq = level == 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); +        return (seq << 16) + (nbBits << 8) + (U32)level; +    } +} -/* HUF_fillDTableX2Level2() : - * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, -                           const U32* rankValOrigin, const int minWeight, -                           const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, -                           U32 nbBitsBaseline, U16 baseSeq, U32* wksp, size_t wkspSize) +/** + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int level)  {      HUF_DEltX2 DElt; -    U32* rankVal = wksp; +    U32 const val = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); +    DEBUG_STATIC_ASSERT(sizeof(DElt) == sizeof(val)); +    ZSTD_memcpy(&DElt, &val, sizeof(val)); +    return DElt; +} -    assert(wkspSize >= HUF_TABLELOG_MAX + 1); -    (void)wkspSize; -    /* get pre-calculated rankVal */ -    ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); +/** + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int level) +{ +    U32 DElt = HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); +    return (U64)DElt + ((U64)DElt << 32); +} -    /* fill skipped values */ -    if (minWeight>1) { -        U32 i, skipSize = rankVal[minWeight]; -        MEM_writeLE16(&(DElt.sequence), baseSeq); -        DElt.nbBits   = (BYTE)(consumed); -        DElt.length   = 1; -        for (i = 0; i < skipSize; i++) -            DTable[i] = DElt; +/** + * Fills the DTable rank with all the symbols from [begin, end) that are each + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. + * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level == 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. 
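
The DEBUG_STATIC_ASSERTs above pin down the HUF_DEltX2 layout so that one packed integer store writes a whole entry. A little-endian sketch of the round trip (byte values are arbitrary; the assert assumes a little-endian target):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t sequence; uint8_t nbBits; uint8_t length; } DEltX2;

    int main(void)
    {
        uint32_t const packed = 0x42u          /* first symbol (level-1 baseSeq) */
                              | (0x41u << 8)   /* second symbol */
                              | (9u << 16)     /* nbBits */
                              | (2u << 24);    /* level == length == 2 */
        DEltX2 d;
        memcpy(&d, &packed, sizeof(d));
        assert(d.sequence == 0x4142 && d.nbBits == 9 && d.length == 2);
        return 0;
    }
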
+ */ +static void HUF_fillDTableX2ForWeight( +    HUF_DEltX2* DTableRank, +    sortedSymbol_t const* begin, sortedSymbol_t const* end, +    U32 nbBits, U32 tableLog, +    U16 baseSeq, int const level) +{ +    U32 const length = 1U << ((tableLog - nbBits) & 0x1F /* quiet static-analyzer */); +    const sortedSymbol_t* ptr; +    assert(level >= 1 && level <= 2); +    switch (length) { +    case 1: +        for (ptr = begin; ptr != end; ++ptr) { +            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); +            *DTableRank++ = DElt; +        } +        break; +    case 2: +        for (ptr = begin; ptr != end; ++ptr) { +            HUF_DEltX2 const DElt = HUF_buildDEltX2(ptr->symbol, nbBits, baseSeq, level); +            DTableRank[0] = DElt; +            DTableRank[1] = DElt; +            DTableRank += 2; +        } +        break; +    case 4: +        for (ptr = begin; ptr != end; ++ptr) { +            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); +            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); +            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); +            DTableRank += 4; +        } +        break; +    case 8: +        for (ptr = begin; ptr != end; ++ptr) { +            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); +            ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); +            ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); +            ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); +            ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); +            DTableRank += 8; +        } +        break; +    default: +        for (ptr = begin; ptr != end; ++ptr) { +            U64 const DEltX2 = HUF_buildDEltX2U64(ptr->symbol, nbBits, baseSeq, level); +            HUF_DEltX2* const DTableRankEnd = DTableRank + length; +            for (; DTableRank != DTableRankEnd; DTableRank += 8) { +                ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); +                ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); +                ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); +                ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); +            } +        } +        break;      } +} -    /* fill DTable */ -    {   U32 s; for (s=0; s<sortedListSize; s++) {   /* note : sortedSymbols already skipped */ -            const U32 symbol = sortedSymbols[s].symbol; -            const U32 weight = sortedSymbols[s].weight; -            const U32 nbBits = nbBitsBaseline - weight; -            const U32 length = 1 << (sizeLog-nbBits); -            const U32 start = rankVal[weight]; -            U32 i = start; -            const U32 end = start + length; - -            MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); -            DElt.nbBits = (BYTE)(nbBits + consumed); -            DElt.length = 2; -            do { DTable[i++] = DElt; } while (i<end);   /* since length >= 1 */ +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 consumedBits, +                           const U32* rankVal, const int minWeight, const int maxWeight1, +                           const sortedSymbol_t* sortedSymbols, U32 const* rankStart, +                           U32 nbBitsBaseline, U16 baseSeq) +{ +    /* Fill skipped values (all positions up to rankVal[minWeight]). 
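
The length switch above is just a spread: a code of nbBits bits owns 2^(tableLog - nbBits) consecutive DTable slots, so the suffix bits of any lookup index are don't-cares. A toy illustration:

    #include <assert.h>

    int main(void)
    {
        unsigned char table[16] = {0};
        unsigned const tableLog = 4, nbBits = 2;
        unsigned const start = 4;                 /* rankVal[] position of this code */
        unsigned const length = 1u << (tableLog - nbBits);
        unsigned i;
        for (i = 0; i < length; ++i) table[start + i] = 'A';
        assert(table[4] == 'A' && table[7] == 'A' && table[8] == 0);
        return 0;
    }
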
+     * These are positions only get a single symbol because the combined weight +     * is too large. +     */ +    if (minWeight>1) { +        U32 const length = 1U << ((targetLog - consumedBits) & 0x1F /* quiet static-analyzer */); +        U64 const DEltX2 = HUF_buildDEltX2U64(baseSeq, consumedBits, /* baseSeq */ 0, /* level */ 1); +        int const skipSize = rankVal[minWeight]; +        assert(length > 1); +        assert((U32)skipSize < length); +        switch (length) { +        case 2: +            assert(skipSize == 1); +            ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); +            break; +        case 4: +            assert(skipSize <= 4); +            ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); +            ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); +            break; +        default: +            { +                int i; +                for (i = 0; i < skipSize; i += 8) { +                    ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); +                    ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); +                    ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); +                    ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); +                } +            } +        } +    } -            rankVal[weight] += length; -    }   } +    /* Fill each of the second level symbols by weight. */ +    { +        int w; +        for (w = minWeight; w < maxWeight1; ++w) { +            int const begin = rankStart[w]; +            int const end = rankStart[w+1]; +            U32 const nbBits = nbBitsBaseline - w; +            U32 const totalBits = nbBits + consumedBits; +            HUF_fillDTableX2ForWeight( +                DTable + rankVal[w], +                sortedSymbols + begin, sortedSymbols + end, +                totalBits, targetLog, +                baseSeq, /* level */ 2); +        } +    }  } -  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, -                           const sortedSymbol_t* sortedList, const U32 sortedListSize, +                           const sortedSymbol_t* sortedList,                             const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, -                           const U32 nbBitsBaseline, U32* wksp, size_t wkspSize) +                           const U32 nbBitsBaseline)  { -    U32* rankVal = wksp; +    U32* const rankVal = rankValOrigin[0];      const int scaleLog = nbBitsBaseline - targetLog;   /* note : targetLog >= srcLog, hence scaleLog <= 1 */      const U32 minBits  = nbBitsBaseline - maxWeight; -    U32 s; - -    assert(wkspSize >= HUF_TABLELOG_MAX + 1); -    wksp += HUF_TABLELOG_MAX + 1; -    wkspSize -= HUF_TABLELOG_MAX + 1; - -    ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + 1)); - -    /* fill DTable */ -    for (s=0; s<sortedListSize; s++) { -        const U16 symbol = sortedList[s].symbol; -        const U32 weight = sortedList[s].weight; -        const U32 nbBits = nbBitsBaseline - weight; -        const U32 start = rankVal[weight]; -        const U32 length = 1 << (targetLog-nbBits); - -        if (targetLog-nbBits >= minBits) {   /* enough room for a second symbol */ -            U32 sortedRank; +    int w; +    int const wEnd = (int)maxWeight + 1; + +    /* Fill DTable in order of weight. 
*/ +    for (w = 1; w < wEnd; ++w) { +        int const begin = (int)rankStart[w]; +        int const end = (int)rankStart[w+1]; +        U32 const nbBits = nbBitsBaseline - w; + +        if (targetLog-nbBits >= minBits) { +            /* Enough room for a second symbol. */ +            int start = rankVal[w]; +            U32 const length = 1U << ((targetLog - nbBits) & 0x1F /* quiet static-analyzer */);              int minWeight = nbBits + scaleLog; +            int s;              if (minWeight < 1) minWeight = 1; -            sortedRank = rankStart[minWeight]; -            HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, -                           rankValOrigin[nbBits], minWeight, -                           sortedList+sortedRank, sortedListSize-sortedRank, -                           nbBitsBaseline, symbol, wksp, wkspSize); +            /* Fill the DTable for every symbol of weight w. +             * These symbols get at least 1 second symbol. +             */ +            for (s = begin; s != end; ++s) { +                HUF_fillDTableX2Level2( +                    DTable + start, targetLog, nbBits, +                    rankValOrigin[nbBits], minWeight, wEnd, +                    sortedList, rankStart, +                    nbBitsBaseline, sortedList[s].symbol); +                start += length; +            }          } else { -            HUF_DEltX2 DElt; -            MEM_writeLE16(&(DElt.sequence), symbol); -            DElt.nbBits = (BYTE)(nbBits); -            DElt.length = 1; -            {   U32 const end = start + length; -                U32 u; -                for (u = start; u < end; u++) DTable[u] = DElt; -        }   } -        rankVal[weight] += length; +            /* Only a single symbol. */ +            HUF_fillDTableX2ForWeight( +                DTable + rankVal[w], +                sortedList + begin, sortedList + end, +                nbBits, targetLog, +                /* baseSeq */ 0, /* level */ 1); +        }      }  }  typedef struct {      rankValCol_t rankVal[HUF_TABLELOG_MAX];      U32 rankStats[HUF_TABLELOG_MAX + 1]; -    U32 rankStart0[HUF_TABLELOG_MAX + 2]; +    U32 rankStart0[HUF_TABLELOG_MAX + 3];      sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1];      BYTE weightList[HUF_SYMBOLVALUE_MAX + 1];      U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; @@ -627,9 +1046,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,                         const void* src, size_t srcSize,                               void* workSpace, size_t wkspSize)  { -    U32 tableLog, maxW, sizeOfSort, nbSymbols; +    return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +                       const void* src, size_t srcSize, +                             void* workSpace, size_t wkspSize, int bmi2) +{ +    U32 tableLog, maxW, nbSymbols;      DTableDesc dtd = HUF_getDTableDesc(DTable); -    U32 const maxTableLog = dtd.maxTableLog; +    U32 maxTableLog = dtd.maxTableLog;      size_t iSize;      void* dtPtr = DTable+1;   /* force compiler to avoid strict-aliasing */      HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; @@ -647,11 +1073,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,      if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);      /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... 
*/ -    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), /* bmi2 */ 0); +    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);      if (HUF_isError(iSize)) return iSize;      /* check result */      if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge);   /* DTable can't fit code depth */ +    if (tableLog <= HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECODER_FAST_TABLELOG) maxTableLog = HUF_DECODER_FAST_TABLELOG;      /* find maxWeight */      for (maxW = tableLog; wksp->rankStats[maxW]==0; maxW--) {}  /* necessarily finds a solution before 0 */ @@ -664,7 +1091,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,              rankStart[w] = curr;          }          rankStart[0] = nextRankStart;   /* put all 0w symbols at the end of sorted list*/ -        sizeOfSort = nextRankStart; +        rankStart[maxW+1] = nextRankStart;      }      /* sort symbols by weight */ @@ -673,7 +1100,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,              U32 const w = wksp->weightList[s];              U32 const r = rankStart[w]++;              wksp->sortedSymbol[r].symbol = (BYTE)s; -            wksp->sortedSymbol[r].weight = (BYTE)w;          }          rankStart[0] = 0;   /* forget 0w symbols; this is beginning of weight(1) */      } @@ -698,10 +1124,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,      }   }   }   }      HUF_fillDTableX2(dt, maxTableLog, -                   wksp->sortedSymbol, sizeOfSort, +                   wksp->sortedSymbol,                     wksp->rankStart0, wksp->rankVal, maxW, -                   tableLog+1, -                   wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32)); +                   tableLog+1);      dtd.tableLog = (BYTE)maxTableLog;      dtd.tableType = 1; @@ -714,7 +1139,7 @@ FORCE_INLINE_TEMPLATE U32  HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)  {      size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */ -    ZSTD_memcpy(op, dt+val, 2); +    ZSTD_memcpy(op, &dt[val].sequence, 2);      BIT_skipBits(DStream, dt[val].nbBits);      return dt[val].length;  } @@ -723,15 +1148,17 @@ FORCE_INLINE_TEMPLATE U32  HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)  {      size_t const val = BIT_lookBitsFast(DStream, dtLog);   /* note : dtLog >= 1 */ -    ZSTD_memcpy(op, dt+val, 1); -    if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); -    else { +    ZSTD_memcpy(op, &dt[val].sequence, 1); +    if (dt[val].length==1) { +        BIT_skipBits(DStream, dt[val].nbBits); +    } else {          if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {              BIT_skipBits(DStream, dt[val].nbBits);              if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))                  /* ugly hack; works only because it's the last symbol. 
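
The X2 decode step above always copies 2 bytes but advances by 'length', which is why level-1 entries keep the symbol in the low half of 'sequence'. A simplified model over a sentinel-style bit container (the real code goes through BIT_lookBitsFast/BIT_skipBits; decode_step is an illustrative name):

    #include <stdint.h>
    #include <string.h>

    typedef struct { uint16_t sequence; uint8_t nbBits; uint8_t length; } DEltX2;

    static size_t decode_step(uint8_t* op, uint64_t* bits, const DEltX2* dt, unsigned dtLog)
    {
        size_t const idx = (size_t)(*bits >> (64 - dtLog));  /* top dtLog bits */
        memcpy(op, &dt[idx].sequence, 2);  /* always write 2 bytes; needs 1 byte slack */
        *bits <<= dt[idx].nbBits;          /* consume the code */
        return dt[idx].length;             /* caller advances op by 1 or 2 */
    }
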
Note : can't easily extract nbBits from just this symbol */                  DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); -    }   } +        } +    }      return 1;  } @@ -753,19 +1180,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,      BYTE* const pStart = p;      /* up to 8 symbols at a time */ -    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { -        HUF_DECODE_SYMBOLX2_2(p, bitDPtr); -        HUF_DECODE_SYMBOLX2_1(p, bitDPtr); -        HUF_DECODE_SYMBOLX2_2(p, bitDPtr); -        HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +    if ((size_t)(pEnd - p) >= sizeof(bitDPtr->bitContainer)) { +        if (dtLog <= 11 && MEM_64bits()) { +            /* up to 10 symbols at a time */ +            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-9)) { +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +            } +        } else { +            /* up to 8 symbols at a time */ +            while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { +                HUF_DECODE_SYMBOLX2_2(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_1(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_2(p, bitDPtr); +                HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +            } +        } +    } else { +        BIT_reloadDStream(bitDPtr);      }      /* closer to end : up to 2 symbols at a time */ -    while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) -        HUF_DECODE_SYMBOLX2_0(p, bitDPtr); +    if ((size_t)(pEnd - p) >= 2) { +        while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) +            HUF_DECODE_SYMBOLX2_0(p, bitDPtr); -    while (p <= pEnd-2) -        HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */ +        while (p <= pEnd-2) +            HUF_DECODE_SYMBOLX2_0(p, bitDPtr);   /* no need to reload : reached the end of DStream */ +    }      if (p < pEnd)          p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -799,7 +1244,6 @@ HUF_decompress1X2_usingDTable_internal_body(      /* decoded size */      return dstSize;  } -  FORCE_INLINE_TEMPLATE size_t  HUF_decompress4X2_usingDTable_internal_body(            void* dst,  size_t dstSize, @@ -841,57 +1285,60 @@ HUF_decompress4X2_usingDTable_internal_body(          U32 const dtLog = dtd.tableLog;          if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */ +        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );          CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );          /* 16-32 symbols per loop (4-8 symbols per stream) */ -        for ( ; (endSignal) & (op4 < olimit); ) { +        if ((size_t)(oend - op4) >= sizeof(size_t)) { +            for ( ; (endSignal) & (op4 < olimit); ) {  #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -            HUF_DECODE_SYMBOLX2_2(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_1(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_2(op1, &bitD1); -            
HUF_DECODE_SYMBOLX2_0(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_1(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_0(op2, &bitD2); -            endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; -            endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; -            HUF_DECODE_SYMBOLX2_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_1(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_0(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_1(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_0(op4, &bitD4); -            endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; -            endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +                HUF_DECODE_SYMBOLX2_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_1(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_0(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_1(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_0(op2, &bitD2); +                endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; +                endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; +                HUF_DECODE_SYMBOLX2_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_1(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_0(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_2(op4, &bitD4); +                HUF_DECODE_SYMBOLX2_1(op4, &bitD4); +                HUF_DECODE_SYMBOLX2_2(op4, &bitD4); +                HUF_DECODE_SYMBOLX2_0(op4, &bitD4); +                endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; +                endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;  #else -            HUF_DECODE_SYMBOLX2_2(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_1(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_1(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_1(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_1(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_2(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_2(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_2(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_2(op4, &bitD4); -            HUF_DECODE_SYMBOLX2_0(op1, &bitD1); -            HUF_DECODE_SYMBOLX2_0(op2, &bitD2); -            HUF_DECODE_SYMBOLX2_0(op3, &bitD3); -            HUF_DECODE_SYMBOLX2_0(op4, &bitD4); -            endSignal = (U32)LIKELY( -                        (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) -                      & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) -                      & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) -                      & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +                HUF_DECODE_SYMBOLX2_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_2(op4, &bitD4); +                
HUF_DECODE_SYMBOLX2_1(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_1(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_1(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_1(op4, &bitD4); +                HUF_DECODE_SYMBOLX2_2(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_2(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_2(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_2(op4, &bitD4); +                HUF_DECODE_SYMBOLX2_0(op1, &bitD1); +                HUF_DECODE_SYMBOLX2_0(op2, &bitD2); +                HUF_DECODE_SYMBOLX2_0(op3, &bitD3); +                HUF_DECODE_SYMBOLX2_0(op4, &bitD4); +                endSignal = (U32)LIKELY((U32) +                            (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) +                        & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) +                        & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) +                        & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));  #endif +            }          }          /* check corruption */ @@ -915,8 +1362,99 @@ HUF_decompress4X2_usingDTable_internal_body(      }  } +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable) { +    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable) { +    return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( +          void* dst,  size_t dstSize, +    const void* cSrc, size_t cSrcSize, +    const HUF_DTable* DTable) { +    void const* dt = DTable + 1; +    const BYTE* const iend = (const BYTE*)cSrc + 6; +    BYTE* const oend = (BYTE*)dst + dstSize; +    HUF_DecompressAsmArgs args; +    { +        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +        FORWARD_IF_ERROR(ret, "Failed to init asm args"); +        if (ret != 0) +            return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +    } + +    assert(args.ip[0] >= args.ilimit); +    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + +    /* note : op4 already verified within main loop */ +    assert(args.ip[0] >= iend); +    assert(args.ip[1] >= iend); +    assert(args.ip[2] >= iend); +    assert(args.ip[3] >= iend); +    assert(args.op[3] <= oend); +    (void)iend; + +    /* finish bitStreams one by one */ +    { +        size_t const segmentSize = (dstSize+3) / 4; +        BYTE* segmentEnd = (BYTE*)dst; +        int i; +        for (i = 0; i < 4; ++i) { +            BIT_DStream_t bit; +            if (segmentSize <= (size_t)(oend - segmentEnd)) +                segmentEnd += segmentSize; +            else +                segmentEnd = oend; +            FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segmentEnd), "corruption"); +            args.op[i] += HUF_decodeStreamX2(args.op[i], &bit, segmentEnd, (HUF_DEltX2 const*)dt, 
HUF_DECODER_FAST_TABLELOG); +            if (args.op[i] != segmentEnd) +                return ERROR(corruption_detected); +        } +    } + +    /* decoded size */ +    return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 +    if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 +        return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# else +        return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); +# endif +    } +#else +    (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +    return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#else +    return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#endif +} +  HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal)  size_t HUF_decompress1X2_usingDTable(            void* dst,  size_t dstSize, @@ -1025,25 +1563,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, double */] =  {      /* single, double, quad */ -    {{0,0}, {1,1}, {2,2}},  /* Q==0 : impossible */ -    {{0,0}, {1,1}, {2,2}},  /* Q==1 : impossible */ -    {{  38,130}, {1313, 74}, {2151, 38}},   /* Q == 2 : 12-18% */ -    {{ 448,128}, {1353, 74}, {2238, 41}},   /* Q == 3 : 18-25% */ -    {{ 556,128}, {1353, 74}, {2238, 47}},   /* Q == 4 : 25-32% */ -    {{ 714,128}, {1418, 74}, {2436, 53}},   /* Q == 5 : 32-38% */ -    {{ 883,128}, {1437, 74}, {2464, 61}},   /* Q == 6 : 38-44% */ -    {{ 897,128}, {1515, 75}, {2622, 68}},   /* Q == 7 : 44-50% */ -    {{ 926,128}, {1613, 75}, {2730, 75}},   /* Q == 8 : 50-56% */ -    {{ 947,128}, {1729, 77}, {3359, 77}},   /* Q == 9 : 56-62% */ -    {{1107,128}, {2083, 81}, {4006, 84}},   /* Q ==10 : 62-69% */ -    {{1177,128}, {2379, 87}, {4785, 88}},   /* Q ==11 : 69-75% */ -    {{1242,128}, {2415, 93}, {5155, 84}},   /* Q ==12 : 75-81% */ -    {{1349,128}, {2644,106}, {5260,106}},   /* Q ==13 : 81-87% */ -    {{1455,128}, {2422,124}, {4174,124}},   /* Q ==14 : 87-93% */ -    {{ 722,128}, {1891,145}, {1936,146}},   /* Q ==15 : 93-99% */ +    {{0,0}, {1,1}},  /* Q==0 : impossible */ +    {{0,0}, {1,1}},  /* Q==1 : impossible */ +    {{ 150,216}, { 381,119}},   /* Q == 2 : 12-18% */ +    {{ 170,205}, { 514,112}},   /* Q == 3 : 18-25% */ +    {{ 177,199}, { 539,110}},   /* Q == 4 : 25-32% */ +    {{ 197,194}, { 644,107}},   /* Q == 5 : 32-38% */ +    {{ 221,192}, { 735,107}},   /* Q == 6 : 38-44% */ +    {{ 256,189}, { 881,106}},   /* Q == 7 : 44-50% */ +    {{ 359,188}, {1167,109}},   /* Q == 8 : 50-56% */ +    {{ 582,187}, {1570,114}},   /* Q == 9 : 56-62% */ +    {{ 688,187}, {1712,122}},   /* Q ==10 : 62-69% */ +    {{ 825,186}, {1965,136}},   /* Q ==11 : 69-75% */ +    {{ 976,185}, {2131,150}},   /* Q ==12 : 75-81% */ +    {{1180,186}, {2070,175}},   /* Q ==13 : 81-87% */ +    {{1377,185}, {1731,202}},   /* Q ==14 : 87-93% */ +    {{1412,185}, {1695,202}},   /* Q ==15 : 93-99% */  };  #endif @@ 
-1070,7 +1608,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)          U32 const D256 = (U32)(dstSize >> 8);          U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);          U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); -        DTime1 += DTime1 >> 3;  /* advantage to algorithm using less memory, to reduce cache eviction */ +        DTime1 += DTime1 >> 5;  /* small advantage to algorithm using less memory, to reduce cache eviction */          return DTime1 < DTime0;      }  #endif diff --git a/thirdparty/zstd/decompress/huf_decompress_amd64.S b/thirdparty/zstd/decompress/huf_decompress_amd64.S new file mode 100644 index 0000000000..49589cb611 --- /dev/null +++ b/thirdparty/zstd/decompress/huf_decompress_amd64.S @@ -0,0 +1,585 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#include "../common/portability_macros.h" + +/* Stack marking + * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart + */ +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +/* Calling convention: + * + * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rbp isn't maintained (no frame pointer). + * %rsp contains the stack pointer that grows down. + *      No red-zone is assumed, only addresses >= %rsp are used. + * All register contents are preserved. + * + * TODO: Support Windows calling convention. + */ + +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +.text + +/* Sets up register mappings for clarity. + * op[], bits[], dtable & ip[0] each get their own register. + * ip[1,2,3] & olimit alias var[]. + * %rax is a scratch register. + */ + +#define op0    rsi +#define op1    rbx +#define op2    rcx +#define op3    rdi + +#define ip0    r8 +#define ip1    r9 +#define ip2    r10 +#define ip3    r11 + +#define bits0  rbp +#define bits1  rdx +#define bits2  r12 +#define bits3  r13 +#define dtable r14 +#define olimit r15 + +/* var[] aliases ip[1,2,3] & olimit + * ip[1,2,3] are saved every iteration. + * olimit is only used in compute_olimit. + */ +#define var0   r15 +#define var1   r9 +#define var2   r10 +#define var3   r11 + +/* 32-bit var registers */ +#define vard0  r15d +#define vard1  r9d +#define vard2  r10d +#define vard3  r11d + +/* Calls X(N) for each stream 0, 1, 2, 3. */ +#define FOR_EACH_STREAM(X) \ +    X(0);                  \ +    X(1);                  \ +    X(2);                  \ +    X(3) + +/* Calls X(N, idx) for each stream 0, 1, 2, 3. 
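
With the re-benchmarked table above, HUF_selectDecoder()'s estimate is tableTime + decode256Time * (dstSize >> 8), and the X2 estimate is now only inflated by 1/32 (previously 1/8). Plugging in the Q == 8 row as a sanity check:

    #include <assert.h>

    typedef struct { unsigned tableTime, decode256Time; } algo_time_t;

    int main(void)
    {
        algo_time_t const x1 = { 359, 188 }, x2 = { 1167, 109 };  /* Q == 8 row */
        unsigned const D256 = 100000 >> 8;   /* dstSize = 100 KB -> 390 */
        unsigned const t0 = x1.tableTime + x1.decode256Time * D256;  /* 73679 */
        unsigned t1 = x2.tableTime + x2.decode256Time * D256;        /* 43677 */
        t1 += t1 >> 5;               /* small penalty for the bigger X2 table */
        assert(t1 < t0);             /* double-symbol decoder still wins here */
        return 0;
    }
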
*/ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ +    X(0, idx);                             \ +    X(1, idx);                             \ +    X(2, idx);                             \ +    X(3, idx) + +/* Define both _HUF_* & HUF_* symbols because MacOS + * C symbols are prefixed with '_' & Linux symbols aren't. + */ +_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +    /* Save all registers - even if they are callee saved for simplicity. */ +    push %rax +    push %rbx +    push %rcx +    push %rdx +    push %rbp +    push %rsi +    push %rdi +    push %r8 +    push %r9 +    push %r10 +    push %r11 +    push %r12 +    push %r13 +    push %r14 +    push %r15 + +    /* Read HUF_DecompressAsmArgs* args from %rax */ +    movq %rdi, %rax +    movq  0(%rax), %ip0 +    movq  8(%rax), %ip1 +    movq 16(%rax), %ip2 +    movq 24(%rax), %ip3 +    movq 32(%rax), %op0 +    movq 40(%rax), %op1 +    movq 48(%rax), %op2 +    movq 56(%rax), %op3 +    movq 64(%rax), %bits0 +    movq 72(%rax), %bits1 +    movq 80(%rax), %bits2 +    movq 88(%rax), %bits3 +    movq 96(%rax), %dtable +    push %rax      /* argument */ +    push 104(%rax) /* ilimit */ +    push 112(%rax) /* oend */ +    push %olimit   /* olimit space */ + +    subq $24, %rsp + +.L_4X1_compute_olimit: +    /* Computes how many iterations we can do safely +     * %r15, %rax may be clobbered +     * rbx, rdx must be saved +     * op3 & ip0 mustn't be clobbered +     */ +    movq %rbx, 0(%rsp) +    movq %rdx, 8(%rsp) + +    movq 32(%rsp), %rax /* rax = oend */ +    subq %op3,    %rax  /* rax = oend - op3 */ + +    /* r15 = (oend - op3) / 5 */ +    movabsq $-3689348814741910323, %rdx +    mulq %rdx +    movq %rdx, %r15 +    shrq $2, %r15 + +    movq %ip0,     %rax /* rax = ip0 */ +    movq 40(%rsp), %rdx /* rdx = ilimit */ +    subq %rdx,     %rax /* rax = ip0 - ilimit */ +    movq %rax,     %rbx /* rbx = ip0 - ilimit */ + +    /* rdx = (ip0 - ilimit) / 7 */ +    movabsq $2635249153387078803, %rdx +    mulq %rdx +    subq %rdx, %rbx +    shrq %rbx +    addq %rbx, %rdx +    shrq $2, %rdx + +    /* r15 = min(%rdx, %r15) */ +    cmpq %rdx, %r15 +    cmova %rdx, %r15 + +    /* r15 = r15 * 5 */ +    leaq (%r15, %r15, 4), %r15 + +    /* olimit = op3 + r15 */ +    addq %op3, %olimit + +    movq 8(%rsp), %rdx +    movq 0(%rsp), %rbx + +    /* If (op3 + 20 > olimit) */ +    movq %op3, %rax    /* rax = op3 */ +    addq $20,  %rax    /* rax = op3 + 20 */ +    cmpq %rax, %olimit /* op3 + 20 > olimit */ +    jb .L_4X1_exit + +    /* If (ip1 < ip0) go to exit */ +    cmpq %ip0, %ip1 +    jb .L_4X1_exit + +    /* If (ip2 < ip1) go to exit */ +    cmpq %ip1, %ip2 +    jb .L_4X1_exit + +    /* If (ip3 < ip2) go to exit */ +    cmpq %ip2, %ip3 +    jb .L_4X1_exit + +/* Reads top 11 bits from bits[n] + * Loads dt[bits[n]] into var[n] + */ +#define GET_NEXT_DELT(n)                \ +    movq $53, %var##n;                  \ +    shrxq %var##n, %bits##n, %var##n;   \ +    movzwl (%dtable,%var##n,2),%vard##n + +/* var[n] must contain the DTable entry computed with GET_NEXT_DELT + * Moves var[n] to %rax + * bits[n] <<= var[n] & 63 + * op[n][idx] = %rax >> 8 + * %ah is a way to access bits [8, 16) of %rax + */ +#define DECODE_FROM_DELT(n, idx)       \ +    movq %var##n, %rax;                \ +    shlxq %var##n, %bits##n, %bits##n; \ +    movb %ah, idx(%op##n) + +/* Assumes GET_NEXT_DELT has been called. 
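
The movabsq constants in .L_4X1_compute_olimit above implement unsigned division by 5 and by 7 with a multiply-high, exactly as a compiler would emit it (-3689348814741910323 is 0xCCCCCCCCCCCCCCCD reinterpreted as signed). They can be checked in C, assuming __uint128_t (gcc/clang on a 64-bit target):

    #include <assert.h>
    #include <stdint.h>

    static uint64_t div5(uint64_t x)
    {   /* high 64 bits of x * 0xCCCC...CCCD, then >> 2 */
        uint64_t const hi = (uint64_t)(((__uint128_t)x * 0xCCCCCCCCCCCCCCCDull) >> 64);
        return hi >> 2;
    }
    static uint64_t div7(uint64_t x)
    {   /* 0x2492492492492493 with the add-back sequence used by the asm */
        uint64_t const hi = (uint64_t)(((__uint128_t)x * 0x2492492492492493ull) >> 64);
        return (((x - hi) >> 1) + hi) >> 2;
    }
    int main(void)
    {
        assert(div5(123456789) == 123456789u / 5);
        assert(div7(123456789) == 123456789u / 7);
        return 0;
    }
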
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT + */ +#define DECODE_AND_GET_NEXT(n, idx) \ +    DECODE_FROM_DELT(n, idx);       \ +    GET_NEXT_DELT(n)                \ + +/* // ctz & nbBytes is stored in bits[n] + * // nbBits is stored in %rax + * ctz  = CTZ[bits[n]] + * nbBits  = ctz & 7 + * nbBytes = ctz >> 3 + * op[n]  += 5 + * ip[n]  -= nbBytes + * // Note: x86-64 is little-endian ==> no bswap + * bits[n] = MEM_readST(ip[n]) | 1 + * bits[n] <<= nbBits + */ +#define RELOAD_BITS(n)             \ +    bsfq %bits##n, %bits##n;       \ +    movq %bits##n, %rax;           \ +    andq $7, %rax;                 \ +    shrq $3, %bits##n;             \ +    leaq 5(%op##n), %op##n;        \ +    subq %bits##n, %ip##n;         \ +    movq (%ip##n), %bits##n;       \ +    orq $1, %bits##n;              \ +    shlx %rax, %bits##n, %bits##n + +    /* Store clobbered variables on the stack */ +    movq %olimit, 24(%rsp) +    movq %ip1, 0(%rsp) +    movq %ip2, 8(%rsp) +    movq %ip3, 16(%rsp) + +    /* Call GET_NEXT_DELT for each stream */ +    FOR_EACH_STREAM(GET_NEXT_DELT) + +    .p2align 6 + +.L_4X1_loop_body: +    /* Decode 5 symbols in each of the 4 streams (20 total) +     * Must have called GET_NEXT_DELT for each stream +     */ +    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) +    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) +    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) +    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) +    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) + +    /* Load ip[1,2,3] from stack (var[] aliases them) +     * ip[] is needed for RELOAD_BITS +     * Each will be stored back to the stack after RELOAD +     */ +    movq 0(%rsp), %ip1 +    movq 8(%rsp), %ip2 +    movq 16(%rsp), %ip3 + +    /* Reload each stream & fetch the next table entry +     * to prepare for the next iteration +     */ +    RELOAD_BITS(0) +    GET_NEXT_DELT(0) + +    RELOAD_BITS(1) +    movq %ip1, 0(%rsp) +    GET_NEXT_DELT(1) + +    RELOAD_BITS(2) +    movq %ip2, 8(%rsp) +    GET_NEXT_DELT(2) + +    RELOAD_BITS(3) +    movq %ip3, 16(%rsp) +    GET_NEXT_DELT(3) + +    /* If op3 < olimit: continue the loop */ +    cmp %op3, 24(%rsp) +    ja .L_4X1_loop_body + +    /* Reload ip[1,2,3] from stack */ +    movq 0(%rsp), %ip1 +    movq 8(%rsp), %ip2 +    movq 16(%rsp), %ip3 + +    /* Re-compute olimit */ +    jmp .L_4X1_compute_olimit + +#undef GET_NEXT_DELT +#undef DECODE_FROM_DELT +#undef DECODE +#undef RELOAD_BITS +.L_4X1_exit: +    addq $24, %rsp + +    /* Restore stack (oend & olimit) */ +    pop %rax /* olimit */ +    pop %rax /* oend */ +    pop %rax /* ilimit */ +    pop %rax /* arg */ + +    /* Save ip / op / bits */ +    movq %ip0,  0(%rax) +    movq %ip1,  8(%rax) +    movq %ip2, 16(%rax) +    movq %ip3, 24(%rax) +    movq %op0, 32(%rax) +    movq %op1, 40(%rax) +    movq %op2, 48(%rax) +    movq %op3, 56(%rax) +    movq %bits0, 64(%rax) +    movq %bits1, 72(%rax) +    movq %bits2, 80(%rax) +    movq %bits3, 88(%rax) + +    /* Restore registers */ +    pop %r15 +    pop %r14 +    pop %r13 +    pop %r12 +    pop %r11 +    pop %r10 +    pop %r9 +    pop %r8 +    pop %rdi +    pop %rsi +    pop %rbp +    pop %rdx +    pop %rcx +    pop %rbx +    pop %rax +    ret + +_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +    /* Save all registers - even if they are callee saved for simplicity. 
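
RELOAD_BITS above is the refill half of the sentinel trick: the trailing-zero count gives the total bits consumed, the whole bytes rewind the (backwards-read) input pointer, and the sub-byte remainder is re-applied as a shift. A C rendering (the op += 5 advance done in the macro is left to the caller here):

    #include <stdint.h>
    #include <string.h>

    static void reload_bits(const uint8_t** ip, uint64_t* bits)
    {
        unsigned const consumed = (unsigned)__builtin_ctzll(*bits);
        unsigned const nbBits   = consumed & 7;    /* bits past the last byte boundary */
        unsigned const nbBytes  = consumed >> 3;   /* whole bytes consumed */
        uint64_t v;
        *ip -= nbBytes;                            /* streams are read backwards */
        memcpy(&v, *ip, sizeof(v));                /* little-endian load, like the asm */
        *bits = (v | 1) << nbBits;                 /* re-seat sentinel, re-consume rest */
    }
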
*/ +    push %rax +    push %rbx +    push %rcx +    push %rdx +    push %rbp +    push %rsi +    push %rdi +    push %r8 +    push %r9 +    push %r10 +    push %r11 +    push %r12 +    push %r13 +    push %r14 +    push %r15 + +    movq %rdi, %rax +    movq  0(%rax), %ip0 +    movq  8(%rax), %ip1 +    movq 16(%rax), %ip2 +    movq 24(%rax), %ip3 +    movq 32(%rax), %op0 +    movq 40(%rax), %op1 +    movq 48(%rax), %op2 +    movq 56(%rax), %op3 +    movq 64(%rax), %bits0 +    movq 72(%rax), %bits1 +    movq 80(%rax), %bits2 +    movq 88(%rax), %bits3 +    movq 96(%rax), %dtable +    push %rax      /* argument */ +    push %rax      /* olimit */ +    push 104(%rax) /* ilimit */ + +    movq 112(%rax), %rax +    push %rax /* oend3 */ + +    movq %op3, %rax +    push %rax /* oend2 */ + +    movq %op2, %rax +    push %rax /* oend1 */ + +    movq %op1, %rax +    push %rax /* oend0 */ + +    /* Scratch space */ +    subq $8, %rsp + +.L_4X2_compute_olimit: +    /* Computes how many iterations we can do safely +     * %r15, %rax may be clobbered +     * rdx must be saved +     * op[1,2,3,4] & ip0 mustn't be clobbered +     */ +    movq %rdx, 0(%rsp) + +    /* We can consume up to 7 input bytes each iteration. */ +    movq %ip0,     %rax  /* rax = ip0 */ +    movq 40(%rsp), %rdx  /* rdx = ilimit */ +    subq %rdx,     %rax  /* rax = ip0 - ilimit */ +    movq %rax,    %r15   /* r15 = ip0 - ilimit */ + +    /* rdx = rax / 7 */ +    movabsq $2635249153387078803, %rdx +    mulq %rdx +    subq %rdx, %r15 +    shrq %r15 +    addq %r15, %rdx +    shrq $2, %rdx + +    /* r15 = (ip0 - ilimit) / 7 */ +    movq %rdx, %r15 + +    movabsq $-3689348814741910323, %rdx +    movq 8(%rsp), %rax /* rax = oend0 */ +    subq %op0,    %rax /* rax = oend0 - op0 */ +    mulq %rdx +    shrq $3,      %rdx /* rdx = rax / 10 */ + +    /* r15 = min(%rdx, %r15) */ +    cmpq  %rdx, %r15 +    cmova %rdx, %r15 + +    movabsq $-3689348814741910323, %rdx +    movq 16(%rsp), %rax /* rax = oend1 */ +    subq %op1,     %rax /* rax = oend1 - op1 */ +    mulq %rdx +    shrq $3,       %rdx /* rdx = rax / 10 */ + +    /* r15 = min(%rdx, %r15) */ +    cmpq  %rdx, %r15 +    cmova %rdx, %r15 + +    movabsq $-3689348814741910323, %rdx +    movq 24(%rsp), %rax /* rax = oend2 */ +    subq %op2,     %rax /* rax = oend2 - op2 */ +    mulq %rdx +    shrq $3,       %rdx /* rdx = rax / 10 */ + +    /* r15 = min(%rdx, %r15) */ +    cmpq  %rdx, %r15 +    cmova %rdx, %r15 + +    movabsq $-3689348814741910323, %rdx +    movq 32(%rsp), %rax /* rax = oend3 */ +    subq %op3,     %rax /* rax = oend3 - op3 */ +    mulq %rdx +    shrq $3,       %rdx /* rdx = rax / 10 */ + +    /* r15 = min(%rdx, %r15) */ +    cmpq  %rdx, %r15 +    cmova %rdx, %r15 + +    /* olimit = op3 + 5 * r15 */ +    movq %r15, %rax +    leaq (%op3, %rax, 4), %olimit +    addq %rax, %olimit + +    movq 0(%rsp), %rdx + +    /* If (op3 + 10 > olimit) */ +    movq %op3, %rax    /* rax = op3 */ +    addq $10,  %rax    /* rax = op3 + 10 */ +    cmpq %rax, %olimit /* op3 + 10 > olimit */ +    jb .L_4X2_exit + +    /* If (ip1 < ip0) go to exit */ +    cmpq %ip0, %ip1 +    jb .L_4X2_exit + +    /* If (ip2 < ip1) go to exit */ +    cmpq %ip1, %ip2 +    jb .L_4X2_exit + +    /* If (ip3 < ip2) go to exit */ +    cmpq %ip2, %ip3 +    jb .L_4X2_exit + +#define DECODE(n, idx)              \ +    movq %bits##n, %rax;            \ +    shrq $53, %rax;                 \ +    movzwl 0(%dtable,%rax,4),%r8d;  \ +    movzbl 2(%dtable,%rax,4),%r15d; \ +    movzbl 3(%dtable,%rax,4),%eax;  \ +    movw %r8w, 
(%op##n);            \ +    shlxq %r15, %bits##n, %bits##n; \ +    addq %rax, %op##n + +#define RELOAD_BITS(n)              \ +    bsfq %bits##n, %bits##n;        \ +    movq %bits##n, %rax;            \ +    shrq $3, %bits##n;              \ +    andq $7, %rax;                  \ +    subq %bits##n, %ip##n;          \ +    movq (%ip##n), %bits##n;        \ +    orq $1, %bits##n;               \ +    shlxq %rax, %bits##n, %bits##n + + +    movq %olimit, 48(%rsp) + +    .p2align 6 + +.L_4X2_loop_body: +    /* We clobber r8, so store it on the stack */ +    movq %r8, 0(%rsp) + +    /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ +    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) +    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) +    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) +    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) +    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) + +    /* Reload r8 */ +    movq 0(%rsp), %r8 + +    FOR_EACH_STREAM(RELOAD_BITS) + +    cmp %op3, 48(%rsp) +    ja .L_4X2_loop_body +    jmp .L_4X2_compute_olimit + +#undef DECODE +#undef RELOAD_BITS +.L_4X2_exit: +    addq $8, %rsp +    /* Restore stack (oend & olimit) */ +    pop %rax /* oend0 */ +    pop %rax /* oend1 */ +    pop %rax /* oend2 */ +    pop %rax /* oend3 */ +    pop %rax /* ilimit */ +    pop %rax /* olimit */ +    pop %rax /* arg */ + +    /* Save ip / op / bits */ +    movq %ip0,  0(%rax) +    movq %ip1,  8(%rax) +    movq %ip2, 16(%rax) +    movq %ip3, 24(%rax) +    movq %op0, 32(%rax) +    movq %op1, 40(%rax) +    movq %op2, 48(%rax) +    movq %op3, 56(%rax) +    movq %bits0, 64(%rax) +    movq %bits1, 72(%rax) +    movq %bits2, 80(%rax) +    movq %bits3, 88(%rax) + +    /* Restore registers */ +    pop %r15 +    pop %r14 +    pop %r13 +    pop %r12 +    pop %r11 +    pop %r10 +    pop %r9 +    pop %r8 +    pop %rdi +    pop %rsi +    pop %rbp +    pop %rdx +    pop %rcx +    pop %rbx +    pop %rax +    ret + +#endif diff --git a/thirdparty/zstd/decompress/zstd_decompress.c b/thirdparty/zstd/decompress/zstd_decompress.c index 910bc034c0..0031e98cfb 100644 --- a/thirdparty/zstd/decompress/zstd_decompress.c +++ b/thirdparty/zstd/decompress/zstd_decompress.c @@ -56,7 +56,6 @@  *  Dependencies  *********************************************************/  #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ -#include "../common/cpu.h"         /* bmi2 */  #include "../common/mem.h"         /* low level memory routines */  #define FSE_STATIC_LINKING_ONLY  #include "../common/fse.h" @@ -177,12 +176,15 @@ static const ZSTD_DDict* ZSTD_DDictHashSet_getDDict(ZSTD_DDictHashSet* hashSet,  static ZSTD_DDictHashSet* ZSTD_createDDictHashSet(ZSTD_customMem customMem) {      ZSTD_DDictHashSet* ret = (ZSTD_DDictHashSet*)ZSTD_customMalloc(sizeof(ZSTD_DDictHashSet), customMem);      DEBUGLOG(4, "Allocating new hash set"); +    if (!ret) +        return NULL;      ret->ddictPtrTable = (const ZSTD_DDict**)ZSTD_customCalloc(DDICT_HASHSET_TABLE_BASE_SIZE * sizeof(ZSTD_DDict*), customMem); -    ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; -    ret->ddictPtrCount = 0; -    if (!ret || !ret->ddictPtrTable) { +    if (!ret->ddictPtrTable) { +        ZSTD_customFree(ret, customMem);          return NULL;      } +    ret->ddictPtrTableSize = DDICT_HASHSET_TABLE_BASE_SIZE; +    ret->ddictPtrCount = 0;      return ret;  } @@ -255,11 +257,15 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)      dctx->inBuffSize  = 0;      dctx->outBuffSize = 0;      dctx->streamStage = zdss_init; +#if 
defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)      dctx->legacyContext = NULL;      dctx->previousLegacyVersion = 0; +#endif      dctx->noForwardProgress = 0;      dctx->oversizedDuration = 0; -    dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#if DYNAMIC_BMI2 +    dctx->bmi2 = ZSTD_cpuSupportsBmi2(); +#endif      dctx->ddictSet = NULL;      ZSTD_DCtx_resetParameters(dctx);  #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION @@ -280,8 +286,7 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)      return dctx;  } -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) {      if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL;      {   ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dctx), customMem); @@ -292,10 +297,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)      }  } +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ +    return ZSTD_createDCtx_internal(customMem); +} +  ZSTD_DCtx* ZSTD_createDCtx(void)  {      DEBUGLOG(3, "ZSTD_createDCtx"); -    return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); +    return ZSTD_createDCtx_internal(ZSTD_defaultCMem);  }  static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -380,6 +390,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size)      return 0;  } +/*! ZSTD_isSkippableFrame() : + *  Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ +    if (size < ZSTD_FRAMEIDSIZE) return 0; +    {   U32 const magic = MEM_readLE32(buffer); +        if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; +    } +    return 0; +} +  /** ZSTD_frameHeaderSize_internal() :   *  srcSize must be large enough to reach header size fields.   *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -466,7 +489,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s          }          switch(dictIDSizeCode)          { -            default: assert(0);  /* impossible */ +            default: +                assert(0);  /* impossible */ +                ZSTD_FALLTHROUGH;              case 0 : break;              case 1 : dictID = ip[pos]; pos++; break;              case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; @@ -474,7 +499,9 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s          }          switch(fcsID)          { -            default: assert(0);  /* impossible */ +            default: +                assert(0);  /* impossible */ +                ZSTD_FALLTHROUGH;              case 0 : if (singleSegment) frameContentSize = ip[pos]; break;              case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;              case 2 : frameContentSize = MEM_readLE32(ip+pos); break; @@ -503,7 +530,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src      return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);  } -  /** ZSTD_getFrameContentSize() :   *  compatible with legacy mode   * @return : decompressed size of the single frame pointed to be `src` if known, otherwise @@ -544,6 +570,37 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)      }  } +/*! 
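Editorial usage sketch (not part of the patch) for the function documented below, with buf/bufCapacity standing in for a caller-supplied scratch buffer:
+ *
+ *     unsigned variant;
+ *     size_t const r = ZSTD_readSkippableFrame(buf, bufCapacity,
+ *                                              &variant, src, srcSize);
+ *     if (!ZSTD_isError(r)) {
+ *         // the first r bytes of buf now hold the payload, and the frame's
+ *         // magic number was ZSTD_MAGIC_SKIPPABLE_START + variant
+ *     }
+ *
+ * 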
ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. + */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +                                            const void* src, size_t srcSize) +{ +    U32 const magicNumber = MEM_readLE32(src); +    size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +    size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; + +    /* check input validity */ +    RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +    RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +    RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); + +    /* deliver payload */ +    if (skippableContentSize > 0  && dst != NULL) +        ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +    if (magicVariant != NULL) +        *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +    return skippableContentSize; +} +  /** ZSTD_findDecompressedSize() :   *  compatible with legacy mode   *  `srcSize` must be the exact length of some number of ZSTD compressed and/or @@ -858,7 +915,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,          switch(blockProperties.blockType)          {          case bt_compressed: -            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1); +            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming);              break;          case bt_raw :              decodedSize = ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, cBlockSize); @@ -1009,7 +1066,7 @@ static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx)      switch (dctx->dictUses) {      default:          assert(0 /* Impossible */); -        /* fall-through */ +        ZSTD_FALLTHROUGH;      case ZSTD_dont_use:          ZSTD_clearDict(dctx);          return NULL; @@ -1031,7 +1088,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr  {  #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)      size_t regenSize; -    ZSTD_DCtx* const dctx = ZSTD_createDCtx(); +    ZSTD_DCtx* const dctx =  ZSTD_createDCtx_internal(ZSTD_defaultCMem);      RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!");      regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);      ZSTD_freeDCtx(dctx); @@ -1065,7 +1122,7 @@ static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t          return dctx->expected;      if (dctx->bType != bt_raw)          return dctx->expected; -    return MIN(MAX(inputSize, 1), dctx->expected); +    return BOUNDED(1, inputSize, dctx->expected);  }  ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { @@ -1073,7 +1130,9 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {      {      default:   /* should not happen */          assert(0); +        ZSTD_FALLTHROUGH;      case 
ZSTDds_getFrameHeaderSize: +        ZSTD_FALLTHROUGH;      case ZSTDds_decodeFrameHeader:          return ZSTDnit_frameHeader;      case ZSTDds_decodeBlockHeader: @@ -1085,6 +1144,7 @@ ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {      case ZSTDds_checkChecksum:          return ZSTDnit_checksum;      case ZSTDds_decodeSkippableHeader: +        ZSTD_FALLTHROUGH;      case ZSTDds_skipFrame:          return ZSTDnit_skippableFrame;      } @@ -1168,7 +1228,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c              {              case bt_compressed:                  DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); -                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); +                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);                  dctx->expected = 0;  /* Streaming not supported */                  break;              case bt_raw : @@ -1493,7 +1553,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,  ZSTD_DStream* ZSTD_createDStream(void)  {      DEBUGLOG(3, "ZSTD_createDStream"); -    return ZSTD_createDStream_advanced(ZSTD_defaultCMem); +    return ZSTD_createDCtx_internal(ZSTD_defaultCMem);  }  ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1503,7 +1563,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)  ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)  { -    return ZSTD_createDCtx_advanced(customMem); +    return ZSTD_createDCtx_internal(customMem);  }  size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1763,7 +1823,8 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)  size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)  {      size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); -    unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); +    /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +    unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);      unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);      size_t const minRBSize = (size_t) neededSize;      RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, @@ -1897,10 +1958,12 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB              DEBUGLOG(5, "stage zdss_init => transparent reset ");              zds->streamStage = zdss_loadHeader;              zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)              zds->legacyVersion = 0; +#endif              zds->hostageByte = 0;              zds->expectedOutBuffer = *output; -            /* fall-through */ +            ZSTD_FALLTHROUGH;          case zdss_loadHeader :              DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); @@ -2038,7 +2101,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB                          zds->outBuffSize = neededOutBuffSize;              }   }   }              zds->streamStage = zdss_read; -            /* fall-through */ +            ZSTD_FALLTHROUGH;          case zdss_read:              DEBUGLOG(5, 
"stage zdss_read"); @@ -2057,7 +2120,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB              }   }              if (ip==iend) { someMoreWork = 0; break; }   /* no more input */              zds->streamStage = zdss_load; -            /* fall-through */ +            ZSTD_FALLTHROUGH;          case zdss_load:              {   size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.c b/thirdparty/zstd/decompress/zstd_decompress_block.c index 349dcdc333..2e44d30d2f 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_block.c +++ b/thirdparty/zstd/decompress/zstd_decompress_block.c @@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,      }  } +/* Allocate buffer for literals, either overlapping current dst, or split between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, +    const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) +{ +    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +    { +        /* room for litbuffer to fit without read faulting */ +        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; +        dctx->litBufferEnd = dctx->litBuffer + litSize; +        dctx->litBufferLocation = ZSTD_in_dst; +    } +    else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +    { +        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ +        if (splitImmediately) { +            /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ +            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; +            dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +        } +        else { +            /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ +            dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; +            dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; +        } +        dctx->litBufferLocation = ZSTD_split; +    } +    else +    { +        /* fits entirely within litExtraBuffer, so no split is necessary */ +        dctx->litBuffer = dctx->litExtraBuffer; +        dctx->litBufferEnd = dctx->litBuffer + litSize; +        dctx->litBufferLocation = ZSTD_not_in_dst; +    } +}  /* Hidden declaration for fullbench */  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, -                          const void* src, size_t srcSize); +                          const void* src, size_t srcSize, +                          void* dst, size_t dstCapacity, const streaming_operation streaming);  /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current + * block will be output.  Otherwise it will be stored at the end of the current dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current output write. 
+ *   * @return : nb of bytes read from src (< srcSize )   *  note : symbol not declared but exposed for fullbench */  size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, -                          const void* src, size_t srcSize)   /* note : srcSize < BLOCKSIZE */ +                          const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */ +                          void* dst, size_t dstCapacity, const streaming_operation streaming)  {      DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");      RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); @@ -90,7 +131,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,          case set_repeat:              DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");              RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); -            /* fall-through */ +            ZSTD_FALLTHROUGH;          case set_compressed:              RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); @@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                  U32 const lhlCode = (istart[0] >> 2) & 3;                  U32 const lhc = MEM_readLE32(istart);                  size_t hufSuccess; +                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);                  switch(lhlCode)                  {                  case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */ @@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                      litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);                      break;                  } +                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");                  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");                  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); +                RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); +                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);                  /* prefetch huffman table if cold */                  if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                      if (singleStream) {                          hufSuccess = HUF_decompress1X_usingDTable_bmi2(                              dctx->litBuffer, litSize, istart+lhSize, litCSize, -                            dctx->HUFptr, dctx->bmi2); +                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));                      } else {                          hufSuccess = HUF_decompress4X_usingDTable_bmi2(                              dctx->litBuffer, litSize, istart+lhSize, litCSize, -                            dctx->HUFptr, dctx->bmi2); +                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));                      }                  } else {                      if (singleStream) { @@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                          hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(                              dctx->entropy.hufTable, dctx->litBuffer, litSize,                              istart+lhSize, litCSize, dctx->workspace, -                            sizeof(dctx->workspace), dctx->bmi2); +                            sizeof(dctx->workspace), 
ZSTD_DCtx_get_bmi2(dctx));  #endif                      } else {                          hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(                              dctx->entropy.hufTable, dctx->litBuffer, litSize,                              istart+lhSize, litCSize, dctx->workspace, -                            sizeof(dctx->workspace), dctx->bmi2); +                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));                      }                  } +                if (dctx->litBufferLocation == ZSTD_split) +                { +                    ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); +                    ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); +                    dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; +                    dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; +                }                  RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); @@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                  dctx->litSize = litSize;                  dctx->litEntropy = 1;                  if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; -                ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);                  return litCSize + lhSize;              }          case set_basic:              {   size_t litSize, lhSize;                  U32 const lhlCode = ((istart[0]) >> 2) & 3; +                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);                  switch(lhlCode)                  {                  case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */ @@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                      break;                  } +                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); +                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);                  if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */                      RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); -                    ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); +                    if (dctx->litBufferLocation == ZSTD_split) +                    { +                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize - ZSTD_LITBUFFEREXTRASIZE); +                        ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); +                    } +                    else +                    { +                        ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litSize); +                    }                      dctx->litPtr = dctx->litBuffer;                      dctx->litSize = litSize; -                    ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);                      return lhSize+litSize;                  }                  /* direct reference into compressed stream */                  dctx->litPtr = istart+lhSize;                  dctx->litSize = litSize; +                dctx->litBufferEnd = dctx->litPtr + litSize; +                
dctx->litBufferLocation = ZSTD_not_in_dst;                  return lhSize+litSize;              }          case set_rle:              {   U32 const lhlCode = ((istart[0]) >> 2) & 3;                  size_t litSize, lhSize; +                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);                  switch(lhlCode)                  {                  case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */ @@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,                      RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");                      break;                  } +                RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");                  RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); -                ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); +                RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); +                ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); +                if (dctx->litBufferLocation == ZSTD_split) +                { +                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize - ZSTD_LITBUFFEREXTRASIZE); +                    ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD_LITBUFFEREXTRASIZE); +                } +                else +                { +                    ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); +                }                  dctx->litPtr = dctx->litBuffer;                  dctx->litSize = litSize;                  return lhSize+1; @@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {  };   /* ML_defaultDTable */ -static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits) +static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U8 nbAddBits)  {      void* ptr = dt;      ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr; @@ -355,7 +431,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB      cell->nbBits = 0;      cell->nextState = 0;      assert(nbAddBits < 255); -    cell->nbAdditionalBits = (BYTE)nbAddBits; +    cell->nbAdditionalBits = nbAddBits;      cell->baseValue = baseValue;  } @@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddB  FORCE_INLINE_TEMPLATE  void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,              const short* normalizedCounter, unsigned maxSymbolValue, -            const U32* baseValue, const U32* nbAdditionalBits, +            const U32* baseValue, const U8* nbAdditionalBits,              unsigned tableLog, void* wksp, size_t wkspSize)  {      ZSTD_seqSymbol* const tableDecode = dt+1; @@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,              tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );              tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);              assert(nbAdditionalBits[symbol] < 255); -            tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol]; +            tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];              tableDecode[u].baseValue = baseValue[symbol];          }      } @@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,  /* Avoids the 
FORCE_INLINE of the _body() function. */  static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,              const short* normalizedCounter, unsigned maxSymbolValue, -            const U32* baseValue, const U32* nbAdditionalBits, +            const U32* baseValue, const U8* nbAdditionalBits,              unsigned tableLog, void* wksp, size_t wkspSize)  {      ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt,  }  #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt, +BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol* dt,              const short* normalizedCounter, unsigned maxSymbolValue, -            const U32* baseValue, const U32* nbAdditionalBits, +            const U32* baseValue, const U8* nbAdditionalBits,              unsigned tableLog, void* wksp, size_t wkspSize)  {      ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSymbol  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,              const short* normalizedCounter, unsigned maxSymbolValue, -            const U32* baseValue, const U32* nbAdditionalBits, +            const U32* baseValue, const U8* nbAdditionalBits,              unsigned tableLog, void* wksp, size_t wkspSize, int bmi2)  {  #if DYNAMIC_BMI2 @@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,  static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,                                   symbolEncodingType_e type, unsigned max, U32 maxLog,                                   const void* src, size_t srcSize, -                                 const U32* baseValue, const U32* nbAdditionalBits, +                                 const U32* baseValue, const U8* nbAdditionalBits,                                   const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,                                   int ddictIsCold, int nbSeq, U32* wksp, size_t wkspSize,                                   int bmi2) @@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymb          RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");          {   U32 const symbol = *(const BYTE*)src;              U32 const baseline = baseValue[symbol]; -            U32 const nbBits = nbAdditionalBits[symbol]; +            U8 const nbBits = nbAdditionalBits[symbol];              ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);          }          *DTablePtr = DTableSpace; @@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,                                                        LL_defaultDTable, dctx->fseEntropy,                                                        dctx->ddictIsCold, nbSeq,                                                        dctx->workspace, sizeof(dctx->workspace), -                                                      dctx->bmi2); +                                                      ZSTD_DCtx_get_bmi2(dctx));              RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");              ip += llhSize;          } @@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,                                                        OF_defaultDTable, dctx->fseEntropy,                                              
          dctx->ddictIsCold, nbSeq,                                                        dctx->workspace, sizeof(dctx->workspace), -                                                      dctx->bmi2); +                                                      ZSTD_DCtx_get_bmi2(dctx));              RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");              ip += ofhSize;          } @@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,                                                        ML_defaultDTable, dctx->fseEntropy,                                                        dctx->ddictIsCold, nbSeq,                                                        dctx->workspace, sizeof(dctx->workspace), -                                                      dctx->bmi2); +                                                      ZSTD_DCtx_get_bmi2(dctx));              RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");              ip += mlhSize;          } @@ -713,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {   *         - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.   *           The src buffer must be before the dst buffer.   */ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {      ptrdiff_t const diff = op - ip;      BYTE* const oend = op + length; @@ -729,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_          /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */          assert(length >= 8);          ZSTD_overlapCopy8(&op, &ip, diff); +        length -= 8;          assert(op - ip >= 8);          assert(op <= oend);      } @@ -743,12 +820,35 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_          assert(oend > oend_w);          ZSTD_wildcopy(op, ip, oend_w - op, ovtype);          ip += oend_w - op; -        op = oend_w; +        op += oend_w - op;      }      /* Handle the leftovers. */      while (op < oend) *op++ = *ip++;  } +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { +    ptrdiff_t const diff = op - ip; +    BYTE* const oend = op + length; + +    if (length < 8 || diff > -8) { +        /* Handle short lengths, close overlaps, and dst not before src. */ +        while (op < oend) *op++ = *ip++; +        return; +    } + +    if (op <= oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { +        ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_overlap); +        ip += oend - WILDCOPY_OVERLENGTH - op; +        op += oend - WILDCOPY_OVERLENGTH - op; +    } + +    /* Handle the leftovers. */ +    while (op < oend) *op++ = *ip++; +} +  /* ZSTD_execSequenceEnd():   * This version handles cases that are near the end of the output buffer. It requires   * more careful checks to make sure there is no overflow. 
By separating out these hard
@@ -759,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_
  */
 FORCE_NOINLINE
 size_t ZSTD_execSequenceEnd(BYTE* op,
-                            BYTE* const oend, seq_t sequence,
-                            const BYTE** litPtr, const BYTE* const litLimit,
-                            const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+    BYTE* const oend, seq_t sequence,
+    const BYTE** litPtr, const BYTE* const litLimit,
+    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
 {
     BYTE* const oLitEnd = op + sequence.litLength;
     size_t const sequenceLength = sequence.litLength + sequence.matchLength;
@@ -784,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
     if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
         /* offset beyond prefix */
         RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
-        match = dictEnd - (prefixStart-match);
+        match = dictEnd - (prefixStart - match);
         if (match + sequence.matchLength <= dictEnd) {
             ZSTD_memmove(oLitEnd, match, sequence.matchLength);
             return sequenceLength;
         }
         /* span extDict & currentPrefixSegment */
         {   size_t const length1 = dictEnd - match;
-            ZSTD_memmove(oLitEnd, match, length1);
-            op = oLitEnd + length1;
-            sequence.matchLength -= length1;
-            match = prefixStart;
-    }   }
+        ZSTD_memmove(oLitEnd, match, length1);
+        op = oLitEnd + length1;
+        sequence.matchLength -= length1;
+        match = prefixStart;
+        }
+    }
+    ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+    return sequenceLength;
+}
+
+/* ZSTD_execSequenceEndSplitLitBuffer():
+ * This version is intended for use while the litBuffer is still split. It is kept separate to avoid a performance impact on the common case.
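+ * While the buffer is split, the remaining literals sit at the tail of the
+ * dst buffer, so the output cursor approaches them from below: literal
+ * copies are dst-before-src and must go through ZSTD_safecopyDstBeforeSrc()
+ * rather than ZSTD_safecopy().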
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, +    BYTE* const oend, const BYTE* const oend_w, seq_t sequence, +    const BYTE** litPtr, const BYTE* const litLimit, +    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ +    BYTE* const oLitEnd = op + sequence.litLength; +    size_t const sequenceLength = sequence.litLength + sequence.matchLength; +    const BYTE* const iLitEnd = *litPtr + sequence.litLength; +    const BYTE* match = oLitEnd - sequence.offset; + + +    /* bounds checks : careful of address space overflow in 32-bit mode */ +    RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); +    RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); +    assert(op < op + sequenceLength); +    assert(oLitEnd < op + sequenceLength); + +    /* copy literals */ +    RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dstSize_tooSmall, "output should not catch up to and overwrite literal buffer"); +    ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); +    op = oLitEnd; +    *litPtr = iLitEnd; + +    /* copy Match */ +    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { +        /* offset beyond prefix */ +        RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); +        match = dictEnd - (prefixStart - match); +        if (match + sequence.matchLength <= dictEnd) { +            ZSTD_memmove(oLitEnd, match, sequence.matchLength); +            return sequenceLength; +        } +        /* span extDict & currentPrefixSegment */ +        {   size_t const length1 = dictEnd - match; +        ZSTD_memmove(oLitEnd, match, length1); +        op = oLitEnd + length1; +        sequence.matchLength -= length1; +        match = prefixStart; +        } +    }      ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);      return sequenceLength;  }  HINT_INLINE  size_t ZSTD_execSequence(BYTE* op, -                         BYTE* const oend, seq_t sequence, -                         const BYTE** litPtr, const BYTE* const litLimit, -                         const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +    BYTE* const oend, seq_t sequence, +    const BYTE** litPtr, const BYTE* const litLimit, +    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)  {      BYTE* const oLitEnd = op + sequence.litLength;      size_t const sequenceLength = sequence.litLength + sequence.matchLength; @@ -821,10 +970,102 @@ size_t ZSTD_execSequence(BYTE* op,       *   - 32-bit mode and the match length overflows       */      if (UNLIKELY( +        iLitEnd > litLimit || +        oMatchEnd > oend_w || +        (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) +        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + +    /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ +    assert(op <= oLitEnd /* No overflow */); +    assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); +    assert(oMatchEnd <= oend /* No underflow */); +    assert(iLitEnd <= litLimit /* Literal length is in bounds */); +    assert(oLitEnd <= oend_w /* Can wildcopy literals */); +    assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + +    /* Copy Literals: 
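+     * (Editorial note: the unconditional 16-byte copy below is safe because
+     * the slow-path test above guarantees oLitEnd <= oend_w, leaving at
+     * least WILDCOPY_OVERLENGTH bytes of headroom at the destination.)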
+     * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. +     * We likely don't need the full 32-byte wildcopy. +     */ +    assert(WILDCOPY_OVERLENGTH >= 16); +    ZSTD_copy16(op, (*litPtr)); +    if (UNLIKELY(sequence.litLength > 16)) { +        ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZSTD_no_overlap); +    } +    op = oLitEnd; +    *litPtr = iLitEnd;   /* update for next sequence */ + +    /* Copy Match */ +    if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { +        /* offset beyond prefix -> go into extDict */ +        RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); +        match = dictEnd + (match - prefixStart); +        if (match + sequence.matchLength <= dictEnd) { +            ZSTD_memmove(oLitEnd, match, sequence.matchLength); +            return sequenceLength; +        } +        /* span extDict & currentPrefixSegment */ +        {   size_t const length1 = dictEnd - match; +        ZSTD_memmove(oLitEnd, match, length1); +        op = oLitEnd + length1; +        sequence.matchLength -= length1; +        match = prefixStart; +        } +    } +    /* Match within prefix of 1 or more bytes */ +    assert(op <= oMatchEnd); +    assert(oMatchEnd <= oend_w); +    assert(match >= prefixStart); +    assert(sequence.matchLength >= 1); + +    /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy +     * without overlap checking. +     */ +    if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { +        /* We bet on a full wildcopy for matches, since we expect matches to be +         * longer than literals (in general). In silesia, ~10% of matches are longer +         * than 16 bytes. +         */ +        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); +        return sequenceLength; +    } +    assert(sequence.offset < WILDCOPY_VECLEN); + +    /* Copy 8 bytes and spread the offset to be >= 8. */ +    ZSTD_overlapCopy8(&op, &match, sequence.offset); + +    /* If the match length is > 8 bytes, then continue with the wildcopy. 
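(The copy stays ZSTD_overlap_src_before_dst because after the 8-byte spread the offset is at least 8 but may still be below WILDCOPY_VECLEN.) 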
*/ +    if (sequence.matchLength > 8) { +        assert(op < oMatchEnd); +        ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD_overlap_src_before_dst); +    } +    return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, +    BYTE* const oend, const BYTE* const oend_w, seq_t sequence, +    const BYTE** litPtr, const BYTE* const litLimit, +    const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ +    BYTE* const oLitEnd = op + sequence.litLength; +    size_t const sequenceLength = sequence.litLength + sequence.matchLength; +    BYTE* const oMatchEnd = op + sequenceLength;   /* risk : address space overflow (32-bits) */ +    const BYTE* const iLitEnd = *litPtr + sequence.litLength; +    const BYTE* match = oLitEnd - sequence.offset; + +    assert(op != NULL /* Precondition */); +    assert(oend_w < oend /* No underflow */); +    /* Handle edge cases in a slow path: +     *   - Read beyond end of literals +     *   - Match end is within WILDCOPY_OVERLIMIT of oend +     *   - 32-bit mode and the match length overflows +     */ +    if (UNLIKELY(              iLitEnd > litLimit ||              oMatchEnd > oend_w ||              (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) -        return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); +        return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);      /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */      assert(op <= oLitEnd /* No overflow */); @@ -892,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op,      return sequenceLength;  } +  static void  ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)  { @@ -905,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqS  }  FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) -{ -    ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; -    U32 const nbBits = DInfo.nbBits; -    size_t const lowBits = BIT_readBits(bitD, nbBits); -    DStatePtr->state = DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 nextState, U32 nbBits)  { -    U32 const nbBits = DInfo.nbBits;      size_t const lowBits = BIT_readBits(bitD, nbBits); -    DStatePtr->state = DInfo.nextState + lowBits; +    DStatePtr->state = nextState + lowBits;  }  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum @@ -937,102 +1169,100 @@ FORCE_INLINE_TEMPLATE seq_t  ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)  {      seq_t seq; -    ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; -    ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; -    ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; -    U32 const llBase = llDInfo.baseValue; -    U32 const mlBase = mlDInfo.baseValue; -    U32 const ofBase = ofDInfo.baseValue; -    BYTE const llBits = llDInfo.nbAdditionalBits; -    BYTE const mlBits = mlDInfo.nbAdditionalBits; -    BYTE const ofBits = ofDInfo.nbAdditionalBits; -    BYTE const totalBits = 
llBits+mlBits+ofBits;
-
-    /* sequence */
-    {   size_t offset;
-        if (ofBits > 1) {
-            ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
-            ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-            assert(ofBits <= MaxOff);
-            if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
-                U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
-                offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
-                BIT_reloadDStream(&seqState->DStream);
-                if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-                assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
-            } else {
-                offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
-                if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
-            }
-            seqState->prevOffset[2] = seqState->prevOffset[1];
-            seqState->prevOffset[1] = seqState->prevOffset[0];
-            seqState->prevOffset[0] = offset;
-        } else {
-            U32 const ll0 = (llBase == 0);
-            if (LIKELY((ofBits == 0))) {
-                if (LIKELY(!ll0))
-                    offset = seqState->prevOffset[0];
-                else {
-                    offset = seqState->prevOffset[1];
-                    seqState->prevOffset[1] = seqState->prevOffset[0];
-                    seqState->prevOffset[0] = offset;
+    const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
+    const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
+    const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+    seq.matchLength = mlDInfo->baseValue;
+    seq.litLength = llDInfo->baseValue;
+    {   U32 const ofBase = ofDInfo->baseValue;
+        BYTE const llBits = llDInfo->nbAdditionalBits;
+        BYTE const mlBits = mlDInfo->nbAdditionalBits;
+        BYTE const ofBits = ofDInfo->nbAdditionalBits;
+        BYTE const totalBits = llBits+mlBits+ofBits;
+
+        U16 const llNext = llDInfo->nextState;
+        U16 const mlNext = mlDInfo->nextState;
+        U16 const ofNext = ofDInfo->nextState;
+        U32 const llnbBits = llDInfo->nbBits;
+        U32 const mlnbBits = mlDInfo->nbBits;
+        U32 const ofnbBits = ofDInfo->nbBits;
+        /*
+         * As gcc has better branch and block analyzers, it is sometimes only
+         * worthwhile to mark branch likeliness for clang; doing so gains
+         * around 3-4% performance there.
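+         * (Editorial note: LIKELY/UNLIKELY expand to __builtin_expect(),
+         * so these clang-only annotations are pure branch-weight hints
+         * with no code of their own.)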
+         */ + +        /* sequence */ +        {   size_t offset; +    #if defined(__clang__) +            if (LIKELY(ofBits > 1)) { +    #else +            if (ofBits > 1) { +    #endif +                ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); +                ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +                assert(ofBits <= MaxOff); +                if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); +                    offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); +                    BIT_reloadDStream(&seqState->DStream); +                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */ +                } else { +                    offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */ +                    if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);                  } +                seqState->prevOffset[2] = seqState->prevOffset[1]; +                seqState->prevOffset[1] = seqState->prevOffset[0]; +                seqState->prevOffset[0] = offset;              } else { -                offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); -                {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; -                    temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */ -                    if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; -                    seqState->prevOffset[1] = seqState->prevOffset[0]; -                    seqState->prevOffset[0] = offset = temp; -        }   }   } -        seq.offset = offset; -    } - -    seq.matchLength = mlBase; -    if (mlBits > 0) -        seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - -    if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) -        BIT_reloadDStream(&seqState->DStream); -    if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) -        BIT_reloadDStream(&seqState->DStream); -    /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ -    ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - -    seq.litLength = llBase; -    if (llBits > 0) -        seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - -    if (MEM_32bits()) -        BIT_reloadDStream(&seqState->DStream); - -    DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", -                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - -    /* ANS state update -     * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). -     * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). -     * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the -     * better option, so it is the default for other compilers. But, if you -     * measure that it is worse, please put up a pull request. 
-     */ -    { -#if defined(__GNUC__) && !defined(__clang__) -        const int kUseUpdateFseState = 1; -#else -        const int kUseUpdateFseState = 0; -#endif -        if (kUseUpdateFseState) { -            ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream);    /* <=  9 bits */ -            ZSTD_updateFseState(&seqState->stateML, &seqState->DStream);    /* <=  9 bits */ -            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */ -            ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);  /* <=  8 bits */ -        } else { -            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo);    /* <=  9 bits */ -            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo);    /* <=  9 bits */ -            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */ -            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo);  /* <=  8 bits */ +                U32 const ll0 = (llDInfo->baseValue == 0); +                if (LIKELY((ofBits == 0))) { +                    offset = seqState->prevOffset[ll0]; +                    seqState->prevOffset[1] = seqState->prevOffset[!ll0]; +                    seqState->prevOffset[0] = offset; +                } else { +                    offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); +                    {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */ +                        if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; +                        seqState->prevOffset[1] = seqState->prevOffset[0]; +                        seqState->prevOffset[0] = offset = temp; +            }   }   } +            seq.offset = offset;          } + +    #if defined(__clang__) +        if (UNLIKELY(mlBits > 0)) +    #else +        if (mlBits > 0) +    #endif +            seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + +        if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +            BIT_reloadDStream(&seqState->DStream); +        if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) +            BIT_reloadDStream(&seqState->DStream); +        /* Ensure there are enough bits to read the rest of data in 64-bit mode. 
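(The static assert below checks 16 + LLFSELog + MLFSELog + OffFSELog = 42 < STREAM_ACCUMULATOR_MIN_64 = 57, so one reload covers all three state updates plus slack.) 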
*/ +        ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +    #if defined(__clang__) +        if (UNLIKELY(llBits > 0)) +    #else +        if (llBits > 0) +    #endif +            seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + +        if (MEM_32bits()) +            BIT_reloadDStream(&seqState->DStream); + +        DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", +                    (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */ +        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */ +        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */ +        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */      }      return seq; @@ -1085,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence(  #endif  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + +  FORCE_INLINE_TEMPLATE size_t  DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, +ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,                                 void* dst, size_t maxDstSize,                           const void* seqStart, size_t seqSize, int nbSeq,                           const ZSTD_longOffset_e isLongOffset, @@ -1099,11 +1331,11 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,      BYTE* const oend = ostart + maxDstSize;      BYTE* op = ostart;      const BYTE* litPtr = dctx->litPtr; -    const BYTE* const litEnd = litPtr + dctx->litSize; +    const BYTE* litBufferEnd = dctx->litBufferEnd;      const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);      const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);      const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); -    DEBUGLOG(5, "ZSTD_decompressSequences_body"); +    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");      (void)frame;      /* Regen sequences */ @@ -1124,55 +1356,237 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,                  BIT_DStream_endOfBuffer < BIT_DStream_completed &&                  BIT_DStream_completed < BIT_DStream_overflow); +        /* decompress without overrunning litPtr begins */ +        { +            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +            /* Align the decompression loop to 32 + 16 bytes. +                * +                * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +                * speed swings based on the alignment of the decompression loop. This +                * performance swing is caused by parts of the decompression loop falling +                * out of the DSB. The entire decompression loop should fit in the DSB, +                * when it can't we get much worse performance. You can measure if you've +                * hit the good case or the bad case with this perf command for some +                * compressed file test.zst: +                * +                *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ +                *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst +                * +                * If you see most cycles served out of the MITE you've hit the bad case. +                * If you see most cycles served out of the DSB you've hit the good case. 
+                * If it is pretty even then you may be in an okay case.
+                *
+                * This issue has been reproduced on the following CPUs:
+                *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+                *               Use Instruments->Counters to get DSB/MITE cycles.
+                *               I never got performance swings, but I was able to
+                *               go from the good case of mostly DSB to half of the
+                *               cycles served from MITE.
+                *   - Coffeelake: Intel i9-9900k
+                *   - Coffeelake: Intel i7-9700k
+                *
+                * I haven't been able to reproduce the instability or DSB misses on any
+                * of the following CPUs:
+                *   - Haswell
+                *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+                *   - Skylake
+                *
+                * Alignment is done for each of the three major decompression loops:
+                *   - ZSTD_decompressSequences_bodySplitLitBuffer - presplit section of the literal buffer
+                *   - ZSTD_decompressSequences_bodySplitLitBuffer - postsplit section of the literal buffer
+                *   - ZSTD_decompressSequences_body
+                * Alignment choices are made to minimize large swings on bad cases and influence on performance
+                * from changes external to this code, rather than to overoptimize on the current commit.
+                *
+                * If you are seeing performance instability, this script can help test.
+                * It tests on 4 commits in zstd where I saw performance change.
+                *
+                *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+                */
 #if defined(__GNUC__) && defined(__x86_64__)
-        /* Align the decompression loop to 32 + 16 bytes.
-         *
-         * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
-         * speed swings based on the alignment of the decompression loop. This
-         * performance swing is caused by parts of the decompression loop falling
-         * out of the DSB. The entire decompression loop should fit in the DSB,
-         * when it can't we get much worse performance. You can measure if you've
-         * hit the good case or the bad case with this perf command for some
-         * compressed file test.zst:
-         *
-         *   perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
-         *             -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
-         *
-         * If you see most cycles served out of the MITE you've hit the bad case.
-         * If you see most cycles served out of the DSB you've hit the good case.
-         * If it is pretty even then you may be in an okay case.
-         *
-         * This issue has been reproduced on the following CPUs:
-         *   - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
-         *               Use Instruments->Counters to get DSB/MITE cycles.
-         *               I never got performance swings, but I was able to
-         *               go from the good case of mostly DSB to half of the
-         *               cycles served from MITE.
-         *   - Coffeelake: Intel i9-9900k -         *   - Coffeelake: Intel i7-9700k -         * -         * I haven't been able to reproduce the instability or DSB misses on any -         * of the following CPUS: -         *   - Haswell -         *   - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH -         *   - Skylake -         * -         * If you are seeing performance stability this script can help test. -         * It tests on 4 commits in zstd where I saw performance change. -         * -         *   https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 -         */ -        __asm__(".p2align 6"); -        __asm__("nop"); -        __asm__(".p2align 5"); -        __asm__("nop"); -#  if __GNUC__ >= 9 -        /* better for gcc-9 and gcc-10, worse for clang and gcc-8 */ -        __asm__(".p2align 3"); +            __asm__(".p2align 6"); +#  if __GNUC__ >= 7 +	    /* good for gcc-7, gcc-9, and gcc-11 */ +            __asm__("nop"); +            __asm__(".p2align 5"); +            __asm__("nop"); +            __asm__(".p2align 4"); +#    if __GNUC__ == 8 || __GNUC__ == 10 +	    /* good for gcc-8 and gcc-10 */ +            __asm__("nop"); +            __asm__(".p2align 3"); +#    endif +#  endif +#endif + +            /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +                assert(!ZSTD_isError(oneSeqSize)); +                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif +                if (UNLIKELY(ZSTD_isError(oneSeqSize))) +                    return oneSeqSize; +                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +                op += oneSeqSize; +                if (UNLIKELY(!--nbSeq)) +                    break; +                BIT_reloadDStream(&(seqState.DStream)); +                sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +            } + +            /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ +            if (nbSeq > 0) { +                const size_t leftoverLit = dctx->litBufferEnd - litPtr; +                if (leftoverLit) +                { +                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); +                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); +                    sequence.litLength -= leftoverLit; +                    op += leftoverLit; +                } +                litPtr = dctx->litExtraBuffer; +                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; +                dctx->litBufferLocation = ZSTD_not_in_dst; +                { +                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +                    assert(!ZSTD_isError(oneSeqSize)); +                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, 
vBase); +#endif +                    if (UNLIKELY(ZSTD_isError(oneSeqSize))) +                        return oneSeqSize; +                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +                    op += oneSeqSize; +                    if (--nbSeq) +                        BIT_reloadDStream(&(seqState.DStream)); +                } +            } +        } + +        if (nbSeq > 0) /* there is remaining lit from extra buffer */ +        { + +#if defined(__GNUC__) && defined(__x86_64__) +            __asm__(".p2align 6"); +            __asm__("nop"); +#  if __GNUC__ != 7 +            /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and clang */ +            __asm__(".p2align 4"); +            __asm__("nop"); +            __asm__(".p2align 3"); +#  elif __GNUC__ >= 11 +            __asm__(".p2align 3"); +#  else +            __asm__(".p2align 5"); +            __asm__("nop"); +            __asm__(".p2align 3"); +#  endif +#endif + +            for (; ; ) { +                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +                size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +                assert(!ZSTD_isError(oneSeqSize)); +                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif +                if (UNLIKELY(ZSTD_isError(oneSeqSize))) +                    return oneSeqSize; +                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +                op += oneSeqSize; +                if (UNLIKELY(!--nbSeq)) +                    break; +                BIT_reloadDStream(&(seqState.DStream)); +            } +        } + +        /* check if reached exact end */ +        DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); +        RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); +        /* save reps for next block */ +        { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } +    } + +    /* last literal segment */ +    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +    { +        size_t const lastLLSize = litBufferEnd - litPtr; +        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); +        if (op != NULL) { +            ZSTD_memmove(op, litPtr, lastLLSize); +            op += lastLLSize; +        } +        litPtr = dctx->litExtraBuffer; +        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; +        dctx->litBufferLocation = ZSTD_not_in_dst; +    } +    {   size_t const lastLLSize = litBufferEnd - litPtr; +        RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); +        if (op != NULL) { +            ZSTD_memcpy(op, litPtr, lastLLSize); +            op += lastLLSize; +        } +    } + +    return op-ostart; +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, +    void* dst, size_t maxDstSize, +    const void* seqStart, size_t seqSize, int nbSeq, +    const ZSTD_longOffset_e isLongOffset, +    const int frame) +{ +    const BYTE* ip = (const BYTE*)seqStart; +    const BYTE* const iend = ip + 
seqSize; +    BYTE* const ostart = (BYTE*)dst; +    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; +    BYTE* op = ostart; +    const BYTE* litPtr = dctx->litPtr; +    const BYTE* const litEnd = litPtr + dctx->litSize; +    const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); +    const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); +    const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +    DEBUGLOG(5, "ZSTD_decompressSequences_body"); +    (void)frame; + +    /* Regen sequences */ +    if (nbSeq) { +        seqState_t seqState; +        dctx->fseEntropy = 1; +        { U32 i; for (i = 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; } +        RETURN_ERROR_IF( +            ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), +            corruption_detected, ""); +        ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); +        ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); +        ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); +        assert(dst != NULL); + +        ZSTD_STATIC_ASSERT( +            BIT_DStream_unfinished < BIT_DStream_completed && +            BIT_DStream_endOfBuffer < BIT_DStream_completed && +            BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__GNUC__) && defined(__x86_64__) +            __asm__(".p2align 6"); +            __asm__("nop"); +#  if __GNUC__ >= 7 +            __asm__(".p2align 5"); +            __asm__("nop"); +            __asm__(".p2align 3");  #  else -        __asm__(".p2align 4"); +            __asm__(".p2align 4"); +            __asm__("nop"); +            __asm__(".p2align 3");  #  endif  #endif +          for ( ; ; ) {              seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);              size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); @@ -1218,6 +1632,16 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,  {      return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);  } + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, +                                               void* dst, size_t maxDstSize, +                                         const void* seqStart, size_t seqSize, int nbSeq, +                                         const ZSTD_longOffset_e isLongOffset, +                                         const int frame) +{ +    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +}  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT @@ -1250,10 +1674,10 @@ ZSTD_decompressSequencesLong_body(      const BYTE* ip = (const BYTE*)seqStart;      const BYTE* const iend = ip + seqSize;      BYTE* const ostart = (BYTE*)dst; -    BYTE* const oend = ostart + maxDstSize; +    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ostart + maxDstSize;      BYTE* op = ostart;      const BYTE* litPtr = dctx->litPtr; -    const BYTE* const litEnd = litPtr + dctx->litSize; +    const BYTE* litBufferEnd = dctx->litBufferEnd;      const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);      const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);      const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); @@ -1289,32 +1713,94 @@ ZSTD_decompressSequencesLong_body(          }          RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, ""); -        /* decode and decompress */ -        for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) { -            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); +        /* decompress without stomping litBuffer */ +        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) { +            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +            size_t oneSeqSize; + +            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +            { +                /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ +                const size_t leftoverLit = dctx->litBufferEnd - litPtr; +                if (leftoverLit) +                { +                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); +                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); +                    sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength -= leftoverLit; +                    op += leftoverLit; +                } +                litPtr = dctx->litExtraBuffer; +                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; +                dctx->litBufferLocation = ZSTD_not_in_dst; +                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) -            assert(!ZSTD_isError(oneSeqSize)); -            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +                assert(!ZSTD_isError(oneSeqSize)); +                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);  #endif -            if (ZSTD_isError(oneSeqSize)) return oneSeqSize; +                if (ZSTD_isError(oneSeqSize)) return oneSeqSize; -            prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); -            sequences[seqNb & STORED_SEQS_MASK] = sequence; -            op += oneSeqSize; +                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +                sequences[seqNb & STORED_SEQS_MASK] = sequence; +                op += oneSeqSize; +            } +            else +            { +                /* lit buffer is either wholly contained in first or second split, or not split at all*/ +                
oneSeqSize = dctx->litBufferLocation == ZSTD_split ? +                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : +                    ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +                assert(!ZSTD_isError(oneSeqSize)); +                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif +                if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +                sequences[seqNb & STORED_SEQS_MASK] = sequence; +                op += oneSeqSize; +            }          }          RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");          /* finish queue */          seqNb -= seqAdvance;          for ( ; seqNb<nbSeq ; seqNb++) { -            size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd); +            seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]); +            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +            { +                const size_t leftoverLit = dctx->litBufferEnd - litPtr; +                if (leftoverLit) +                { +                    RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); +                    ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); +                    sequence->litLength -= leftoverLit; +                    op += leftoverLit; +                } +                litPtr = dctx->litExtraBuffer; +                litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; +                dctx->litBufferLocation = ZSTD_not_in_dst; +                { +                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);  #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) -            assert(!ZSTD_isError(oneSeqSize)); -            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +                    assert(!ZSTD_isError(oneSeqSize)); +                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);  #endif -            if (ZSTD_isError(oneSeqSize)) return oneSeqSize; -            op += oneSeqSize; +                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize; +                    op += oneSeqSize; +                } +            } +            else +            { +                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+                    ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : +                    ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +                assert(!ZSTD_isError(oneSeqSize)); +                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif +                if (ZSTD_isError(oneSeqSize)) return oneSeqSize; +                op += oneSeqSize; +            }          }          /* save reps for next block */ @@ -1322,10 +1808,21 @@ ZSTD_decompressSequencesLong_body(      }      /* last literal segment */ -    {   size_t const lastLLSize = litEnd - litPtr; +    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */ +    { +        size_t const lastLLSize = litBufferEnd - litPtr; +        RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); +        if (op != NULL) { +            ZSTD_memmove(op, litPtr, lastLLSize); +            op += lastLLSize; +        } +        litPtr = dctx->litExtraBuffer; +        litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; +    } +    {   size_t const lastLLSize = litBufferEnd - litPtr;          RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");          if (op != NULL) { -            ZSTD_memcpy(op, litPtr, lastLLSize); +            ZSTD_memmove(op, litPtr, lastLLSize);              op += lastLLSize;          }      } @@ -1349,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,  #if DYNAMIC_BMI2  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t  DONT_VECTORIZE  ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,                                   void* dst, size_t maxDstSize, @@ -1359,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,  {      return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);  } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, +                                 void* dst, size_t maxDstSize, +                           const void* seqStart, size_t seqSize, int nbSeq, +                           const ZSTD_longOffset_e isLongOffset, +                           const int frame) +{ +    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +}  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t  ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,                                   void* dst, size_t maxDstSize,                             const void* seqStart, size_t seqSize, int nbSeq, @@ -1391,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,  {      DEBUGLOG(5, "ZSTD_decompressSequences");  #if DYNAMIC_BMI2 -    if (dctx->bmi2) { +    if (ZSTD_DCtx_get_bmi2(dctx)) {          return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);      }  #endif -  return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, 
seqStart, seqSize, nbSeq, isLongOffset, frame); +    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, +                                 const void* seqStart, size_t seqSize, int nbSeq, +                                 const ZSTD_longOffset_e isLongOffset, +                                 const int frame) +{ +    DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 +    if (ZSTD_DCtx_get_bmi2(dctx)) { +        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +    } +#endif +    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);  }  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ @@ -1415,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,  {      DEBUGLOG(5, "ZSTD_decompressSequencesLong");  #if DYNAMIC_BMI2 -    if (dctx->bmi2) { +    if (ZSTD_DCtx_get_bmi2(dctx)) {          return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);      }  #endif @@ -1456,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)  size_t  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,                                void* dst, size_t dstCapacity, -                        const void* src, size_t srcSize, const int frame) +                        const void* src, size_t srcSize, const int frame, const streaming_operation streaming)  {   /* blockType == blockCompressed */      const BYTE* ip = (const BYTE*)src;      /* isLongOffset must be true if there are long offsets. 
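
The dispatch wrappers in the hunks above all follow one shape: a single FORCE_INLINE_TEMPLATE body is instantiated twice, once as a portable _default function and once as a _bmi2 function marked with BMI2_TARGET_ATTRIBUTE, and a thin wrapper consults ZSTD_DCtx_get_bmi2() to pick a variant at run time. A minimal standalone sketch of that pattern, with hypothetical names (kernel_body, kernel_default, kernel_bmi2, dispatch_demo; the real kernels are the ZSTD_decompressSequences* bodies):

    #include <stddef.h>

    /* One inlinable body: stands in for a FORCE_INLINE_TEMPLATE kernel. */
    static inline size_t kernel_body(const unsigned char* src, size_t n)
    {
        size_t acc = 0, i;
        for (i = 0; i < n; i++) acc += src[i];
        return acc;
    }

    /* Portable instantiation: always available. */
    static size_t kernel_default(const unsigned char* src, size_t n)
    {
        return kernel_body(src, n);
    }

    #if defined(__GNUC__) && defined(__x86_64__)
    #  define DEMO_DYNAMIC_BMI2 1
    /* BMI2-enabled instantiation: target("bmi2") stands in for
     * BMI2_TARGET_ATTRIBUTE, allowing BMI2 instructions in this
     * function only. */
    __attribute__((target("bmi2")))
    static size_t kernel_bmi2(const unsigned char* src, size_t n)
    {
        return kernel_body(src, n);
    }
    #else
    #  define DEMO_DYNAMIC_BMI2 0
    #endif

    /* Wrapper in the style of ZSTD_decompressSequences(): have_bmi2 plays
     * the role of ZSTD_DCtx_get_bmi2(dctx), which zstd resolves once per
     * context from a CPU feature check. */
    static size_t dispatch_demo(int have_bmi2, const unsigned char* src, size_t n)
    {
    #if DEMO_DYNAMIC_BMI2
        if (have_bmi2) return kernel_bmi2(src, n);
    #else
        (void)have_bmi2;
    #endif
        return kernel_default(src, n);
    }

Compiling the body twice keeps a single source of truth for the hot loop while confining BMI2 code to the variant that is only reachable through the run-time check.
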
@@ -1471,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,      RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");      /* Decode literals section */ -    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); +    {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);          DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);          if (ZSTD_isError(litCSize)) return litCSize;          ip += litCSize; @@ -1519,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG          /* else */ -        return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +        if (dctx->litBufferLocation == ZSTD_split) +            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +        else +            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);  #endif      }  } @@ -1542,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,  {      size_t dSize;      ZSTD_checkContinuity(dctx, dst, dstCapacity); -    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); +    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);      dctx->previousDstEnd = (char*)dst + dSize;      return dSize;  } diff --git a/thirdparty/zstd/decompress/zstd_decompress_block.h b/thirdparty/zstd/decompress/zstd_decompress_block.h index 049a0cd84c..c61a9d0c4b 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_block.h +++ b/thirdparty/zstd/decompress/zstd_decompress_block.h @@ -33,6 +33,12 @@   */ + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { +    not_streaming = 0, +    is_streaming = 1 +} streaming_operation; +  /* ZSTD_decompressBlock_internal() :   * decompress block, starting at `src`,   * into destination buffer `dst`. @@ -41,7 +47,7 @@   */  size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,                                 void* dst, size_t dstCapacity, -                         const void* src, size_t srcSize, const int frame); +                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming);  /* ZSTD_buildFSETable() :   * generate FSE decoding table for one symbol (ll, ml or off) @@ -54,7 +60,7 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,   */  void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,               const short* normalizedCounter, unsigned maxSymbolValue, -             const U32* baseValue, const U32* nbAdditionalBits, +             const U32* baseValue, const U8* nbAdditionalBits,                     unsigned tableLog, void* wksp, size_t wkspSize,                     int bmi2); diff --git a/thirdparty/zstd/decompress/zstd_decompress_internal.h b/thirdparty/zstd/decompress/zstd_decompress_internal.h index ebda0c9031..2b5a53850a 100644 --- a/thirdparty/zstd/decompress/zstd_decompress_internal.h +++ b/thirdparty/zstd/decompress/zstd_decompress_internal.h @@ -20,7 +20,7 @@   *  Dependencies   *********************************************************/  #include "../common/mem.h"             /* BYTE, U16, U32 */ -#include "../common/zstd_internal.h"   /* ZSTD_seqSymbol */ +#include "../common/zstd_internal.h"   /* constants : MaxLL, MaxML, MaxOff, LLFSELog, etc. 
*/
@@ -40,7 +40,7 @@ static UNUSED_ATTR const U32 OF_base[MaxOff+1] = {
                  0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
                  0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
-static UNUSED_ATTR const U32 OF_bits[MaxOff+1] = {
+static UNUSED_ATTR const U8 OF_bits[MaxOff+1] = {
                      0,  1,  2,  3,  4,  5,  6,  7,
                      8,  9, 10, 11, 12, 13, 14, 15,
                     16, 17, 18, 19, 20, 21, 22, 23,
@@ -106,6 +106,22 @@ typedef struct {
     size_t ddictPtrCount;
 } ZSTD_DDictHashSet;
+#ifndef ZSTD_DECODER_INTERNAL_BUFFER
+#  define ZSTD_DECODER_INTERNAL_BUFFER  (1 << 16)
+#endif
+
+#define ZSTD_LBMIN 64
+#define ZSTD_LBMAX (128 << 10)
+
+/* extra buffer, compensates when dst is not large enough to store litBuffer */
+#define ZSTD_LITBUFFEREXTRASIZE  BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX)
+
+typedef enum {
+    ZSTD_not_in_dst = 0,  /* Stored entirely within litExtraBuffer */
+    ZSTD_in_dst = 1,      /* Stored entirely within dst (in memory after current output write) */
+    ZSTD_split = 2        /* Split between litExtraBuffer and dst */
+} ZSTD_litLocation_e;
+
 struct ZSTD_DCtx_s
 {
     const ZSTD_seqSymbol* LLTptr;
@@ -136,7 +152,9 @@ struct ZSTD_DCtx_s
     size_t litSize;
     size_t rleSize;
     size_t staticSize;
+#if DYNAMIC_BMI2 != 0
     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+#endif
 
     /* dictionary */
     ZSTD_DDict* ddictLocal;
@@ -158,16 +176,21 @@ struct ZSTD_DCtx_s
     size_t outStart;
     size_t outEnd;
     size_t lhSize;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
     void* legacyContext;
     U32 previousLegacyVersion;
     U32 legacyVersion;
+#endif
     U32 hostageByte;
     int noForwardProgress;
     ZSTD_bufferMode_e outBufferMode;
     ZSTD_outBuffer expectedOutBuffer;
 
     /* workspace */
-    BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+    BYTE* litBuffer;
+    const BYTE* litBufferEnd;
+    ZSTD_litLocation_e litBufferLocation;
+    BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /* literal buffer can be split between storage within dst and within this scratch buffer */
     BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
 
     size_t oversizedDuration;
@@ -183,6 +206,14 @@ struct ZSTD_DCtx_s
 #endif
 };  /* typedef'd to ZSTD_DCtx within "zstd.h" */
+MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) {
+#if DYNAMIC_BMI2 != 0
+    return dctx->bmi2;
+#else
+    (void)dctx;
+    return 0;
+#endif
+}
 
 /*-*******************************************************
  *  Shared internal functions
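
The net effect of the zstd_decompress_internal.h hunk above: literals no longer occupy a fixed ZSTD_BLOCKSIZE_MAX-sized array inside the DCtx. They land entirely in the litExtraBuffer scratch area, entirely in dst behind the space reserved for the regenerated output, or split across both, and with the default ZSTD_DECODER_INTERNAL_BUFFER of 1 << 16 the scratch area is bounded to 64 KB. Below is a simplified sketch of the placement decision; chooseLitLocation and dstSlack are hypothetical names, and the real decision (made where literals are decoded, outside this diff) also accounts for streaming mode and WILDCOPY_OVERLENGTH padding:

    #include <stddef.h>

    /* Local stand-ins for the zstd_internal.h definitions the diff relies on. */
    #define BOUNDED(min, val, max) \
        ((val) < (min) ? (min) : ((val) > (max) ? (max) : (val)))
    #define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16)
    #define ZSTD_LBMIN 64
    #define ZSTD_LBMAX (128 << 10)
    #define ZSTD_LITBUFFEREXTRASIZE \
        BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL_BUFFER, ZSTD_LBMAX) /* 64 KB */

    typedef enum {
        ZSTD_not_in_dst = 0,  /* literals fit in the litExtraBuffer scratch area */
        ZSTD_in_dst = 1,      /* literals live in dst, after the room kept for output */
        ZSTD_split = 2        /* front of the literals in dst, tail in litExtraBuffer */
    } ZSTD_litLocation_e;

    /* Hypothetical helper: dstSlack is the room left at the end of dst once
     * the regenerated output is accounted for. The split case is why the
     * decode loops above track litBufferLocation and drain the dst-resident
     * part of the literals before switching litPtr to litExtraBuffer. */
    static ZSTD_litLocation_e chooseLitLocation(size_t litSize, size_t dstSlack)
    {
        if (litSize <= dstSlack) return ZSTD_in_dst;
        if (litSize <= ZSTD_LITBUFFEREXTRASIZE) return ZSTD_not_in_dst;
        return ZSTD_split;
    }

This also explains the ZSTD_memmove calls in the split decoders: the dst-resident literal segment can overlap the output being written, so it is moved into place before litBufferLocation flips to ZSTD_not_in_dst, after which literals and output no longer alias and a plain ZSTD_memcpy suffices.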