diff options
Diffstat (limited to 'thirdparty/zstd/common/zstd_internal.h')
-rw-r--r-- | thirdparty/zstd/common/zstd_internal.h | 95 |
1 files changed, 37 insertions, 58 deletions
diff --git a/thirdparty/zstd/common/zstd_internal.h b/thirdparty/zstd/common/zstd_internal.h index 81b16eac2e..dcdcbdb81c 100644 --- a/thirdparty/zstd/common/zstd_internal.h +++ b/thirdparty/zstd/common/zstd_internal.h @@ -56,9 +56,9 @@ extern "C" { /** * Return the specified error if the condition evaluates to true. * - * In debug modes, prints additional information. In order to do that - * (particularly, printing the conditional that failed), this can't just wrap - * RETURN_ERROR(). + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). */ #define RETURN_ERROR_IF(cond, err, ...) \ if (cond) { \ @@ -197,79 +197,56 @@ static void ZSTD_copy8(void* dst, const void* src) { memcpy(dst, src, 8); } static void ZSTD_copy16(void* dst, const void* src) { memcpy(dst, src, 16); } #define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; } -#define WILDCOPY_OVERLENGTH 8 -#define VECLEN 16 +#define WILDCOPY_OVERLENGTH 32 +#define WILDCOPY_VECLEN 16 typedef enum { ZSTD_no_overlap, - ZSTD_overlap_src_before_dst, + ZSTD_overlap_src_before_dst /* ZSTD_overlap_dst_before_src, */ } ZSTD_overlap_e; /*! ZSTD_wildcopy() : - * custom version of memcpy(), can overwrite up to WILDCOPY_OVERLENGTH bytes (if length==0) */ + * Custom version of memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0) + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart. + * The src buffer must be before the dst buffer. + */ MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE -void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype) +void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype) { ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src; const BYTE* ip = (const BYTE*)src; BYTE* op = (BYTE*)dst; BYTE* const oend = op + length; - assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8)); - if (length < VECLEN || (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN)) { - do - COPY8(op, ip) - while (op < oend); - } - else { - if ((length & 8) == 0) - COPY8(op, ip); - do { + assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN)); + + if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) { + /* Handle short offset copies. */ + do { + COPY8(op, ip) + } while (op < oend); + } else { + assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); + /* Separate out the first two COPY16() calls because the copy length is + * almost certain to be short, so the branches have different + * probabilities. + * On gcc-9 unrolling once is +1.6%, twice is +2%, thrice is +1.8%. + * On clang-8 unrolling once is +1.4%, twice is +3.3%, thrice is +3%. + */ COPY16(op, ip); - } - while (op < oend); - } -} - -/*! ZSTD_wildcopy_16min() : - * same semantics as ZSTD_wilcopy() except guaranteed to be able to copy 16 bytes at the start */ -MEM_STATIC FORCE_INLINE_ATTR DONT_VECTORIZE -void ZSTD_wildcopy_16min(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e ovtype) -{ - ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src; - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; - BYTE* const oend = op + length; - - assert(length >= 8); - assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff < -8)); - - if (ovtype == ZSTD_overlap_src_before_dst && diff < VECLEN) { - do - COPY8(op, ip) - while (op < oend); - } - else { - if ((length & 8) == 0) - COPY8(op, ip); - do { COPY16(op, ip); - } - while (op < oend); + if (op >= oend) return; + do { + COPY16(op, ip); + COPY16(op, ip); + } + while (op < oend); } } -MEM_STATIC void ZSTD_wildcopy_e(void* dst, const void* src, void* dstEnd) /* should be faster for decoding, but strangely, not verified on all platform */ -{ - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; - BYTE* const oend = (BYTE*)dstEnd; - do - COPY8(op, ip) - while (op < oend); -} - /*-******************************************* * Private declarations @@ -323,7 +300,9 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus _BitScanReverse(&r, val); return (unsigned)r; # elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */ - return 31 - __builtin_clz(val); + return __builtin_clz (val) ^ 31; +# elif defined(__ICCARM__) /* IAR Intrinsic */ + return 31 - __CLZ(val); # else /* Software version */ static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; U32 v = val; |