diff options
Diffstat (limited to 'thirdparty/libwebp/src/dsp')
-rw-r--r-- | thirdparty/libwebp/src/dsp/dsp.h | 6 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/enc.c | 6 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/enc_avx2.c | 21 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless.h | 14 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc.c | 86 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc_mips32.c | 79 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc_sse2.c | 44 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/msa_macro.h | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler.c | 4 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_mips32.c | 4 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c | 10 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_msa.c | 7 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_neon.c | 18 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_sse2.c | 35 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv.h | 2 |
16 files changed, 170 insertions, 170 deletions
diff --git a/thirdparty/libwebp/src/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h index 4ab77a5130..fafc2d05d3 100644 --- a/thirdparty/libwebp/src/dsp/dsp.h +++ b/thirdparty/libwebp/src/dsp/dsp.h @@ -76,10 +76,6 @@ extern "C" { #define WEBP_USE_SSE41 #endif -#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2) -#define WEBP_USE_AVX2 -#endif - // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if (defined(__ARM_NEON__) || \ @@ -679,4 +675,4 @@ void VP8FiltersInit(void); } // extern "C" #endif -#endif /* WEBP_DSP_DSP_H_ */ +#endif // WEBP_DSP_DSP_H_ diff --git a/thirdparty/libwebp/src/dsp/enc.c b/thirdparty/libwebp/src/dsp/enc.c index fa23b40a30..2fddbc4c52 100644 --- a/thirdparty/libwebp/src/dsp/enc.c +++ b/thirdparty/libwebp/src/dsp/enc.c @@ -734,7 +734,6 @@ VP8BlockCopy VP8Copy16x8; extern void VP8EncDspInitSSE2(void); extern void VP8EncDspInitSSE41(void); -extern void VP8EncDspInitAVX2(void); extern void VP8EncDspInitNEON(void); extern void VP8EncDspInitMIPS32(void); extern void VP8EncDspInitMIPSdspR2(void); @@ -784,11 +783,6 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { #endif } #endif -#if defined(WEBP_USE_AVX2) - if (VP8GetCPUInfo(kAVX2)) { - VP8EncDspInitAVX2(); - } -#endif #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { VP8EncDspInitMIPS32(); diff --git a/thirdparty/libwebp/src/dsp/enc_avx2.c b/thirdparty/libwebp/src/dsp/enc_avx2.c deleted file mode 100644 index 8bc5798fee..0000000000 --- a/thirdparty/libwebp/src/dsp/enc_avx2.c +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2014 Google Inc. All Rights Reserved. -// -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. -// ----------------------------------------------------------------------------- -// -// AVX2 version of speed-critical encoding functions. - -#include "src/dsp/dsp.h" - -#if defined(WEBP_USE_AVX2) - -#endif // WEBP_USE_AVX2 - -//------------------------------------------------------------------------------ -// Entry point - -WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2) diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c index f9b3c182d3..d21aa6a0a0 100644 --- a/thirdparty/libwebp/src/dsp/lossless.c +++ b/thirdparty/libwebp/src/dsp/lossless.c @@ -23,8 +23,6 @@ #include "src/dsp/lossless.h" #include "src/dsp/lossless_common.h" -#define MAX_DIFF_COST (1e30f) - //------------------------------------------------------------------------------ // Image transforms. diff --git a/thirdparty/libwebp/src/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h index b2bbdfc93c..f709cc86b2 100644 --- a/thirdparty/libwebp/src/dsp/lossless.h +++ b/thirdparty/libwebp/src/dsp/lossless.h @@ -163,7 +163,7 @@ extern VP8LCostCombinedFunc VP8LExtraCostCombined; extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; typedef struct { // small struct to hold counters - int counts[2]; // index: 0=zero steak, 1=non-zero streak + int counts[2]; // index: 0=zero streak, 1=non-zero streak int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3] } VP8LStreaks; @@ -194,10 +194,14 @@ extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n, VP8LBitEntropy* const entropy); -typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out); -extern VP8LHistogramAddFunc VP8LHistogramAdd; +typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b, + uint32_t* out, int size); +extern VP8LAddVectorFunc VP8LAddVector; +typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size); +extern VP8LAddVectorEqFunc VP8LAddVectorEq; +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, + VP8LHistogram* const out); // ----------------------------------------------------------------------------- // PrefixEncode() diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index d608326fef..1408fbf580 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -632,38 +632,67 @@ static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, //------------------------------------------------------------------------------ -static void HistogramAdd_C(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out, + int size) { + int i; + for (i = 0; i < size; ++i) out[i] = a[i] + b[i]; +} + +static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) { + int i; + for (i = 0; i < size; ++i) out[i] += a[i]; +} + +#define ADD(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (b->is_used_[X]) { \ + VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } else if (b->is_used_[X]) { \ + memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } else { \ + memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \ + } \ +} while (0) + +#define ADD_EQ(X, ARG, LEN) do { \ + if (a->is_used_[X]) { \ + if (out->is_used_[X]) { \ + VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \ + } else { \ + memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \ + } \ + } \ +} while (0) + +void VP8LHistogramAdd(const VP8LHistogram* const a, + const VP8LHistogram* const b, VP8LHistogram* const out) { int i; const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); assert(a->palette_code_bits_ == b->palette_code_bits_); + if (b != out) { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] = a->red_[i] + b->red_[i]; - out->blue_[i] = a->blue_[i] + b->blue_[i]; - out->alpha_[i] = a->alpha_[i] + b->alpha_[i]; + ADD(0, literal_, literal_size); + ADD(1, red_, NUM_LITERAL_CODES); + ADD(2, blue_, NUM_LITERAL_CODES); + ADD(3, alpha_, NUM_LITERAL_CODES); + ADD(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) { + out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]); } } else { - for (i = 0; i < literal_size; ++i) { - out->literal_[i] += a->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] += a->distance_[i]; - } - for (i = 0; i < NUM_LITERAL_CODES; ++i) { - out->red_[i] += a->red_[i]; - out->blue_[i] += a->blue_[i]; - out->alpha_[i] += a->alpha_[i]; - } + ADD_EQ(0, literal_, literal_size); + ADD_EQ(1, red_, NUM_LITERAL_CODES); + ADD_EQ(2, blue_, NUM_LITERAL_CODES); + ADD_EQ(3, alpha_, NUM_LITERAL_CODES); + ADD_EQ(4, distance_, NUM_DISTANCE_CODES); + for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i]; } } +#undef ADD +#undef ADD_EQ //------------------------------------------------------------------------------ // Image transforms. @@ -848,7 +877,8 @@ VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy; VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined; VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined; -VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LAddVectorFunc VP8LAddVector; +VP8LAddVectorEqFunc VP8LAddVectorEq; VP8LVectorMismatchFunc VP8LVectorMismatch; VP8LBundleColorMapFunc VP8LBundleColorMap; @@ -885,7 +915,8 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { VP8LGetEntropyUnrefined = GetEntropyUnrefined_C; VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C; - VP8LHistogramAdd = HistogramAdd_C; + VP8LAddVector = AddVector_C; + VP8LAddVectorEq = AddVectorEq_C; VP8LVectorMismatch = VectorMismatch_C; VP8LBundleColorMap = VP8LBundleColorMap_C; @@ -971,7 +1002,8 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { assert(VP8LCombinedShannonEntropy != NULL); assert(VP8LGetEntropyUnrefined != NULL); assert(VP8LGetCombinedEntropyUnrefined != NULL); - assert(VP8LHistogramAdd != NULL); + assert(VP8LAddVector != NULL); + assert(VP8LAddVectorEq != NULL); assert(VP8LVectorMismatch != NULL); assert(VP8LBundleColorMap != NULL); assert(VP8LPredictorsSub[0] != NULL); diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c index e7b58f4e8c..0412a093cf 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c @@ -344,65 +344,29 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], ASM_END_COMMON_0 \ ASM_END_COMMON_1 -#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - const uint32_t* pb = (const uint32_t*)(B); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) \ - ASM_END_0 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \ - } \ -} while (0) - -#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do { \ - const uint32_t* pa = (const uint32_t*)(A); \ - uint32_t* pout = (uint32_t*)(OUT); \ - const uint32_t* const LoopEnd = pa + (SIZE); \ - assert((SIZE) % 4 == 0); \ - ASM_START \ - ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) \ - ASM_END_1 \ - if ((EXTRA_SIZE) > 0) { \ - const int last = (EXTRA_SIZE); \ - int i; \ - for (i = 0; i < last; ++i) pout[i] += pa[i]; \ - } \ -} while (0) - -static void HistogramAdd_MIPS32(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { +static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, + uint32_t* pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_) - - (NUM_LITERAL_CODES + NUM_LENGTH_CODES); - assert(a->palette_code_bits_ == b->palette_code_bits_); - - if (b != out) { - ADD_VECTOR(a->literal_, b->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR(a->distance_, b->distance_, out->distance_, - NUM_DISTANCE_CODES, 0); - ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } else { - ADD_VECTOR_EQ(a->literal_, out->literal_, - NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size); - ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0); - ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0); - ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0); - } + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) + ASM_END_0 + for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; +} + +static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { + uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + const uint32_t end = ((size) / 4) * 4; + const uint32_t* const LoopEnd = pa + end; + int i; + ASM_START + ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) + ASM_END_1 + for (i = end; i < size; ++i) pout[i] += pa[i]; } -#undef ADD_VECTOR_EQ -#undef ADD_VECTOR #undef ASM_END_1 #undef ASM_END_0 #undef ASM_END_COMMON_1 @@ -422,7 +386,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { VP8LExtraCostCombined = ExtraCostCombined_MIPS32; VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; - VP8LHistogramAdd = HistogramAdd_MIPS32; + VP8LAddVector = AddVector_MIPS32; + VP8LAddVectorEq = AddVectorEq_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index f84a9909e1..36478c4912 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -170,12 +170,13 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride, //------------------------------------------------------------------------------ +// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But +// that's ok since the histogram values are less than 1<<28 (max picture size). #define LINE_SIZE 16 // 8 or 16 static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -195,12 +196,14 @@ static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out, _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } + for (; i < size; ++i) { + out[i] = a[i] + b[i]; + } } static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { int i; - assert(size % LINE_SIZE == 0); - for (i = 0; i < size; i += LINE_SIZE) { + for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) { const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]); const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]); #if (LINE_SIZE == 16) @@ -220,35 +223,11 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3)); #endif } -} -#undef LINE_SIZE - -// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But -// that's ok since the histogram values are less than 1<<28 (max picture size). -static void HistogramAdd_SSE2(const VP8LHistogram* const a, - const VP8LHistogram* const b, - VP8LHistogram* const out) { - int i; - const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_); - assert(a->palette_code_bits_ == b->palette_code_bits_); - if (b != out) { - AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES); - AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES); - AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES); - AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES); - } else { - AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES); - AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES); - } - for (i = NUM_LITERAL_CODES; i < literal_size; ++i) { - out->literal_[i] = a->literal_[i] + b->literal_[i]; - } - for (i = 0; i < NUM_DISTANCE_CODES; ++i) { - out->distance_[i] = a->distance_[i] + b->distance_[i]; + for (; i < size; ++i) { + out[i] += a[i]; } } +#undef LINE_SIZE //------------------------------------------------------------------------------ // Entropy @@ -675,7 +654,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LTransformColor = TransformColor_SSE2; VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2; VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2; - VP8LHistogramAdd = HistogramAdd_SSE2; + VP8LAddVector = AddVector_SSE2; + VP8LAddVectorEq = AddVectorEq_SSE2; VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; VP8LVectorMismatch = VectorMismatch_SSE2; VP8LBundleColorMap = BundleColorMap_SSE2; diff --git a/thirdparty/libwebp/src/dsp/msa_macro.h b/thirdparty/libwebp/src/dsp/msa_macro.h index dfacda6ccd..de026a1d9e 100644 --- a/thirdparty/libwebp/src/dsp/msa_macro.h +++ b/thirdparty/libwebp/src/dsp/msa_macro.h @@ -1389,4 +1389,4 @@ static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) { } while (0) #define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) -#endif /* WEBP_DSP_MSA_MACRO_H_ */ +#endif // WEBP_DSP_MSA_MACRO_H_ diff --git a/thirdparty/libwebp/src/dsp/rescaler.c b/thirdparty/libwebp/src/dsp/rescaler.c index f307d35056..753f84fcf4 100644 --- a/thirdparty/libwebp/src/dsp/rescaler.c +++ b/thirdparty/libwebp/src/dsp/rescaler.c @@ -21,6 +21,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row import @@ -138,7 +139,7 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { if (yscale) { for (x_out = 0; x_out < x_out_max; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -153,6 +154,7 @@ void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER diff --git a/thirdparty/libwebp/src/dsp/rescaler_mips32.c b/thirdparty/libwebp/src/dsp/rescaler_mips32.c index 542f7e5970..61f63c616c 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_mips32.c +++ b/thirdparty/libwebp/src/dsp/rescaler_mips32.c @@ -209,6 +209,7 @@ static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) { } } +#if 0 // disabled for now. TODO(skal): make match the C-code static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) { const int x_out_max = wrk->dst_width * wrk->num_channels; uint8_t* dst = wrk->dst; @@ -273,6 +274,7 @@ static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) { ); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -283,7 +285,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) { WebPRescalerImportRowExpand = ImportRowExpand_MIPS32; WebPRescalerImportRowShrink = ImportRowShrink_MIPS32; WebPRescalerExportRowExpand = ExportRowExpand_MIPS32; - WebPRescalerExportRowShrink = ExportRowShrink_MIPS32; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32; } #else // !WEBP_USE_MIPS32 diff --git a/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c index b78aac15e6..ce9e64862e 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c +++ b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c @@ -20,10 +20,12 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) //------------------------------------------------------------------------------ // Row export +#if 0 // disabled for now. TODO(skal): make match the C-code static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { int i; const int x_out_max = wrk->dst_width * wrk->num_channels; @@ -106,7 +108,7 @@ static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { } for (i = 0; i < (x_out_max & 0x3); ++i) { const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale); - const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(*irow - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); *dst++ = v; *irow++ = frac; // new fractional start @@ -154,13 +156,14 @@ static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { ); } for (i = 0; i < (x_out_max & 0x3); ++i) { - const int v = (int)MULT_FIX(*irow, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale); assert(v >= 0 && v <= 255); *dst++ = v; *irow++ = 0; } } } +#endif // 0 static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { int i; @@ -294,6 +297,7 @@ static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER @@ -304,7 +308,7 @@ extern void WebPRescalerDspInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) { WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2; - WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2; +// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/thirdparty/libwebp/src/dsp/rescaler_msa.c b/thirdparty/libwebp/src/dsp/rescaler_msa.c index f3bc99f1cd..c559254836 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_msa.c +++ b/thirdparty/libwebp/src/dsp/rescaler_msa.c @@ -22,6 +22,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ v4u32 tmp0, tmp1, tmp2, tmp3; \ @@ -262,6 +263,7 @@ static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { } } +#if 0 // disabled for now. TODO(skal): make match the C-code static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, uint8_t* dst, int length, const uint32_t yscale, @@ -341,7 +343,7 @@ static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, } for (x_out = 0; x_out < length; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; @@ -426,6 +428,7 @@ static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { ExportRowShrink_1(irow, dst, x_out_max, wrk); } } +#endif // 0 //------------------------------------------------------------------------------ // Entry point @@ -434,7 +437,7 @@ extern void WebPRescalerDspInitMSA(void); WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; - WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; +// WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; } #else // !WEBP_USE_MSA diff --git a/thirdparty/libwebp/src/dsp/rescaler_neon.c b/thirdparty/libwebp/src/dsp/rescaler_neon.c index 3eff9fbaf4..a553f06f79 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_neon.c +++ b/thirdparty/libwebp/src/dsp/rescaler_neon.c @@ -22,6 +22,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) #define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC)) #define LOAD_32x8(SRC, DST0, DST1) \ @@ -35,8 +36,11 @@ #if (WEBP_RESCALER_RFIX == 32) #define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1)) -#define MULT_FIX(A, B) /* note: B is actualy scale>>1. See MAKE_HALF_CST */ \ +// note: B is actualy scale>>1. See MAKE_HALF_CST +#define MULT_FIX(A, B) \ vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) +#define MULT_FIX_FLOOR(A, B) \ + vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32((A)), (B))) #else #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work" #endif @@ -135,8 +139,8 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { const uint32x4_t A1 = MULT_FIX(in1, yscale_half); const uint32x4_t B0 = vqsubq_u32(in2, A0); const uint32x4_t B1 = vqsubq_u32(in3, A1); - const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half); - const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half); + const uint32x4_t C0 = MULT_FIX_FLOOR(B0, fxy_scale_half); + const uint32x4_t C1 = MULT_FIX_FLOOR(B1, fxy_scale_half); const uint16x4_t D0 = vmovn_u32(C0); const uint16x4_t D1 = vmovn_u32(C1); const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1)); @@ -145,7 +149,7 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { } for (; x_out < x_out_max; ++x_out) { const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale); - const int v = (int)MULT_FIX_C(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR_C(irow[x_out] - frac, fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -170,6 +174,12 @@ static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR_C +#undef MULT_FIX_C +#undef MULT_FIX_FLOOR +#undef MULT_FIX +#undef ROUNDER + //------------------------------------------------------------------------------ extern void WebPRescalerDspInitNEON(void); diff --git a/thirdparty/libwebp/src/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c index 64c50deab5..f7461a452c 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c @@ -25,6 +25,7 @@ #define ROUNDER (WEBP_RESCALER_ONE >> 1) #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) +#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0 static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) { @@ -224,6 +225,35 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, _mm_storel_epi64((__m128i*)dst, G); } +static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0, + const __m128i* const A1, + const __m128i* const A2, + const __m128i* const A3, + const __m128i* const mult, + uint8_t* const dst) { + const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); + const __m128i B0 = _mm_mul_epu32(*A0, *mult); + const __m128i B1 = _mm_mul_epu32(*A1, *mult); + const __m128i B2 = _mm_mul_epu32(*A2, *mult); + const __m128i B3 = _mm_mul_epu32(*A3, *mult); + const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); + const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX); +#if (WEBP_RESCALER_RFIX < 32) + const __m128i D2 = + _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask); + const __m128i D3 = + _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask); +#else + const __m128i D2 = _mm_and_si128(B2, mask); + const __m128i D3 = _mm_and_si128(B3, mask); +#endif + const __m128i E0 = _mm_or_si128(D0, D2); + const __m128i E1 = _mm_or_si128(D1, D3); + const __m128i F = _mm_packs_epi32(E0, E1); + const __m128i G = _mm_packus_epi16(F, F); + _mm_storel_epi64((__m128i*)dst, G); +} + static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) { int x_out; uint8_t* const dst = wrk->dst; @@ -322,12 +352,12 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { const __m128i G1 = _mm_or_si128(D1, F3); _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0); _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1); - ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); + ProcessRow_Floor_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out); } } for (; x_out < x_out_max; ++x_out) { const uint32_t frac = (int)MULT_FIX(frow[x_out], yscale); - const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); + const int v = (int)MULT_FIX_FLOOR(irow[x_out] - frac, wrk->fxy_scale); assert(v >= 0 && v <= 255); dst[x_out] = v; irow[x_out] = frac; // new fractional start @@ -352,6 +382,7 @@ static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) { } } +#undef MULT_FIX_FLOOR #undef MULT_FIX #undef ROUNDER diff --git a/thirdparty/libwebp/src/dsp/yuv.h b/thirdparty/libwebp/src/dsp/yuv.h index eb787270d2..c12be1d094 100644 --- a/thirdparty/libwebp/src/dsp/yuv.h +++ b/thirdparty/libwebp/src/dsp/yuv.h @@ -207,4 +207,4 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) { } // extern "C" #endif -#endif /* WEBP_DSP_YUV_H_ */ +#endif // WEBP_DSP_YUV_H_ |