diff options
author | Rémi Verschelde <rverschelde@gmail.com> | 2021-11-19 13:54:40 +0100 |
---|---|---|
committer | Rémi Verschelde <rverschelde@gmail.com> | 2021-11-19 13:55:13 +0100 |
commit | 41ce417847ab5eec58aa0a5e618da2ee076e3d67 (patch) | |
tree | 8be256846710834b3f87cb3334b257fbb00e7cf3 /thirdparty/libwebp/src/dsp | |
parent | 42f8bfaff0dc5a94ca351b1eaadc42cb95655b87 (diff) |
libwebp: Sync with upstream 1.2.1
Changes: https://chromium.googlesource.com/webm/libwebp/+/1.2.1/NEWS
Diffstat (limited to 'thirdparty/libwebp/src/dsp')
24 files changed, 637 insertions, 473 deletions
diff --git a/thirdparty/libwebp/src/dsp/alpha_processing.c b/thirdparty/libwebp/src/dsp/alpha_processing.c index 819d1391f2..1892929a43 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing.c @@ -157,7 +157,8 @@ void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) { } } -void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, +void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr, + const uint8_t* WEBP_RESTRICT const alpha, int width, int inverse) { int x; for (x = 0; x < width; ++x) { @@ -178,7 +179,8 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, #undef MFIX void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse); -void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha, +void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr, + const uint8_t* WEBP_RESTRICT const alpha, int width, int inverse); //------------------------------------------------------------------------------ @@ -193,8 +195,8 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows, } } -void WebPMultRows(uint8_t* ptr, int stride, - const uint8_t* alpha, int alpha_stride, +void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride, + const uint8_t* WEBP_RESTRICT alpha, int alpha_stride, int width, int num_rows, int inverse) { int n; for (n = 0; n < num_rows; ++n) { @@ -290,9 +292,9 @@ static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444, } #if !WEBP_NEON_OMIT_C_CODE -static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride, +static int DispatchAlpha_C(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride, int width, int height, - uint8_t* dst, int dst_stride) { + uint8_t* WEBP_RESTRICT dst, int dst_stride) { uint32_t alpha_mask = 0xff; int i, j; @@ -309,9 +311,10 @@ static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride, return (alpha_mask != 0xff); } -static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint32_t* dst, int dst_stride) { +static void DispatchAlphaToGreen_C(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint32_t* WEBP_RESTRICT dst, + int dst_stride) { int i, j; for (j = 0; j < height; ++j) { for (i = 0; i < width; ++i) { @@ -322,9 +325,9 @@ static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride, } } -static int ExtractAlpha_C(const uint8_t* argb, int argb_stride, +static int ExtractAlpha_C(const uint8_t* WEBP_RESTRICT argb, int argb_stride, int width, int height, - uint8_t* alpha, int alpha_stride) { + uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { uint8_t alpha_mask = 0xff; int i, j; @@ -340,7 +343,8 @@ static int ExtractAlpha_C(const uint8_t* argb, int argb_stride, return (alpha_mask == 0xff); } -static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) { +static void ExtractGreen_C(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT alpha, int size) { int i; for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8; } @@ -359,6 +363,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) { return 0; } +static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) { + int x; + for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color; +} + //------------------------------------------------------------------------------ // Simple channel manipulations. @@ -367,8 +376,11 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) { } #ifdef WORDS_BIGENDIAN -static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g, - const uint8_t* b, int len, uint32_t* out) { +static void PackARGB_C(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT r, + const uint8_t* WEBP_RESTRICT g, + const uint8_t* WEBP_RESTRICT b, + int len, uint32_t* WEBP_RESTRICT out) { int i; for (i = 0; i < len; ++i) { out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]); @@ -376,8 +388,10 @@ static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g, } #endif -static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out) { +static void PackRGB_C(const uint8_t* WEBP_RESTRICT r, + const uint8_t* WEBP_RESTRICT g, + const uint8_t* WEBP_RESTRICT b, + int len, int step, uint32_t* WEBP_RESTRICT out) { int i, offset = 0; for (i = 0; i < len; ++i) { out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]); @@ -387,19 +401,26 @@ static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b, void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int); void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int); -int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int); -void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int); -int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int); -void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size); +int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int, + uint8_t* WEBP_RESTRICT, int); +void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT, int, int, int, + uint32_t* WEBP_RESTRICT, int); +int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT, int, int, int, + uint8_t* WEBP_RESTRICT, int); +void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT alpha, int size); #ifdef WORDS_BIGENDIAN void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g, const uint8_t* b, int, uint32_t*); #endif -void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out); +void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r, + const uint8_t* WEBP_RESTRICT g, + const uint8_t* WEBP_RESTRICT b, + int len, int step, uint32_t* WEBP_RESTRICT out); int (*WebPHasAlpha8b)(const uint8_t* src, int length); int (*WebPHasAlpha32b)(const uint8_t* src, int length); +void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color); //------------------------------------------------------------------------------ // Init function @@ -428,13 +449,14 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { WebPHasAlpha8b = HasAlpha8b_C; WebPHasAlpha32b = HasAlpha32b_C; + WebPAlphaReplace = AlphaReplace_C; // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitAlphaProcessingSSE2(); -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { WebPInitAlphaProcessingSSE41(); } @@ -448,7 +470,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { WebPInitAlphaProcessingNEON(); @@ -469,4 +491,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) { assert(WebPPackRGB != NULL); assert(WebPHasAlpha8b != NULL); assert(WebPHasAlpha32b != NULL); + assert(WebPAlphaReplace != NULL); } diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_neon.c b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c index 9d55421704..9e0ace9421 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_neon.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c @@ -80,9 +80,9 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first, //------------------------------------------------------------------------------ -static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint8_t* dst, int dst_stride) { +static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint8_t* WEBP_RESTRICT dst, int dst_stride) { uint32_t alpha_mask = 0xffffffffu; uint8x8_t mask8 = vdup_n_u8(0xff); uint32_t tmp[2]; @@ -112,9 +112,10 @@ static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride, return (alpha_mask != 0xffffffffu); } -static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint32_t* dst, int dst_stride) { +static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint32_t* WEBP_RESTRICT dst, + int dst_stride) { int i, j; uint8x8x4_t greens; // leave A/R/B channels zero'd. greens.val[0] = vdup_n_u8(0); @@ -131,9 +132,9 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride, } } -static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride, +static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride, int width, int height, - uint8_t* alpha, int alpha_stride) { + uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { uint32_t alpha_mask = 0xffffffffu; uint8x8_t mask8 = vdup_n_u8(0xff); uint32_t tmp[2]; @@ -161,8 +162,8 @@ static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride, return (alpha_mask == 0xffffffffu); } -static void ExtractGreen_NEON(const uint32_t* argb, - uint8_t* alpha, int size) { +static void ExtractGreen_NEON(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT alpha, int size) { int i; for (i = 0; i + 16 <= size; i += 16) { const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i)); diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c index 2871c56d84..a5f8c9f7c7 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -18,9 +18,9 @@ //------------------------------------------------------------------------------ -static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint8_t* dst, int dst_stride) { +static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint8_t* WEBP_RESTRICT dst, int dst_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; @@ -72,9 +72,10 @@ static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride, return (alpha_and != 0xff); } -static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint32_t* dst, int dst_stride) { +static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint32_t* WEBP_RESTRICT dst, + int dst_stride) { int i, j; const __m128i zero = _mm_setzero_si128(); const int limit = width & ~15; @@ -98,9 +99,9 @@ static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride, } } -static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride, +static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, int width, int height, - uint8_t* alpha, int alpha_stride) { + uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; @@ -265,6 +266,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) { return 0; } +static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { + const __m128i m_color = _mm_set1_epi32(color); + const __m128i zero = _mm_setzero_si128(); + int i = 0; + for (; i + 8 <= length; i += 8) { + const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); + const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4)); + const __m128i b0 = _mm_srai_epi32(a0, 24); + const __m128i b1 = _mm_srai_epi32(a1, 24); + const __m128i c0 = _mm_cmpeq_epi32(b0, zero); + const __m128i c1 = _mm_cmpeq_epi32(b1, zero); + const __m128i d0 = _mm_and_si128(c0, m_color); + const __m128i d1 = _mm_and_si128(c1, m_color); + const __m128i e0 = _mm_andnot_si128(c0, a0); + const __m128i e1 = _mm_andnot_si128(c1, a1); + _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0)); + _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1)); + } + for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color; +} + // ----------------------------------------------------------------------------- // Apply alpha value to rows @@ -296,7 +318,8 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) { if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse); } -static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha, +static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr, + const uint8_t* WEBP_RESTRICT const alpha, int width, int inverse) { int x = 0; if (!inverse) { @@ -334,6 +357,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { WebPHasAlpha8b = HasAlpha8b_SSE2; WebPHasAlpha32b = HasAlpha32b_SSE2; + WebPAlphaReplace = AlphaReplace_SSE2; } #else // !WEBP_USE_SSE2 diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c index 56040f9c88..cdf877ce49 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c @@ -19,9 +19,9 @@ //------------------------------------------------------------------------------ -static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride, - int width, int height, - uint8_t* alpha, int alpha_stride) { +static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb, + int argb_stride, int width, int height, + uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { // alpha_and stores an 'and' operation of all the alpha[] values. The final // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; diff --git a/thirdparty/libwebp/src/dsp/cost.c b/thirdparty/libwebp/src/dsp/cost.c index cc681cdd4b..460ec4f2a7 100644 --- a/thirdparty/libwebp/src/dsp/cost.c +++ b/thirdparty/libwebp/src/dsp/cost.c @@ -395,12 +395,12 @@ WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) { VP8EncDspCostInitMIPSdspR2(); } #endif -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8EncDspCostInitSSE2(); } #endif -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (VP8GetCPUInfo(kNEON)) { VP8EncDspCostInitNEON(); } diff --git a/thirdparty/libwebp/src/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c index 0fa5b6a5ce..3145e190a4 100644 --- a/thirdparty/libwebp/src/dsp/cpu.c +++ b/thirdparty/libwebp/src/dsp/cpu.c @@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) { : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type), "c"(0)); } -#elif (defined(_M_X64) || defined(_M_IX86)) && \ - defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1 +#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + +#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1 #include <intrin.h> #define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0 -#elif defined(WEBP_MSC_SSE2) +#define WEBP_HAVE_MSC_CPUID +#elif _MSC_VER > 1310 +#include <intrin.h> #define GetCPUInfo __cpuid +#define WEBP_HAVE_MSC_CPUID +#endif + #endif // NaCl has no support for xgetbv or the raw opcode. @@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) { #define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains. #endif -#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2) +#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID) // helper function for run-time detection of slow SSSE3 platforms static int CheckSlowModel(int info) { @@ -179,9 +185,34 @@ static int AndroidCPUInfo(CPUFeature feature) { return 0; } VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo; -#elif defined(WEBP_USE_NEON) -// define a dummy function to enable turning off NEON at runtime by setting -// VP8DecGetCPUInfo = NULL +#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test +// Use compile flags as an indicator of SIMD support instead of a runtime check. +static int wasmCPUInfo(CPUFeature feature) { + switch (feature) { +#ifdef WEBP_HAVE_SSE2 + case kSSE2: + return 1; +#endif +#ifdef WEBP_HAVE_SSE41 + case kSSE3: + case kSlowSSSE3: + case kSSE4_1: + return 1; +#endif +#ifdef WEBP_HAVE_NEON + case kNEON: + return 1; +#endif + default: + break; + } + return 0; +} +VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; +#elif defined(WEBP_HAVE_NEON) +// In most cases this function doesn't check for NEON support (it's assumed by +// the configuration), but enables turning off NEON at runtime, for testing +// purposes, by setting VP8DecGetCPUInfo = NULL. static int armCPUInfo(CPUFeature feature) { if (feature != kNEON) return 0; #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD) diff --git a/thirdparty/libwebp/src/dsp/dec.c b/thirdparty/libwebp/src/dsp/dec.c index 1119842dd3..537c701282 100644 --- a/thirdparty/libwebp/src/dsp/dec.c +++ b/thirdparty/libwebp/src/dsp/dec.c @@ -807,10 +807,10 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8DspInitSSE2(); -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { VP8DspInitSSE41(); } @@ -834,7 +834,7 @@ WEBP_DSP_INIT_FUNC(VP8DspInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { VP8DspInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/dec_neon.c b/thirdparty/libwebp/src/dsp/dec_neon.c index 239ec4167e..fa851707e2 100644 --- a/thirdparty/libwebp/src/dsp/dec_neon.c +++ b/thirdparty/libwebp/src/dsp/dec_neon.c @@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) { // DC const uint8x8_t A = vld1_u8(dst - BPS); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); + const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1)); const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3 @@ -1429,8 +1429,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x8_t A = vld1_u8(dst - BPS); // top row #if defined(__aarch64__) - const uint16x8_t B = vmovl_u8(A); - const uint16_t p2 = vaddvq_u16(B); + const uint16_t p2 = vaddlv_u8(A); sum_top = vdupq_n_u16(p2); #else const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top @@ -1441,18 +1440,18 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) { } if (do_left) { - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1)); - const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1)); - const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1)); - const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1)); - const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); - const uint16x8_t s2 = vaddq_u16(L4, L5); - const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1); + const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1); + const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1); + const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1); + const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); + const uint16x8_t s2 = vaddl_u8(L4, L5); + const uint16x8_t s3 = vaddl_u8(L6, L7); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t s23 = vaddq_u16(s2, s3); sum_left = vaddq_u16(s01, s23); @@ -1512,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) { if (do_top) { const uint8x16_t A = vld1q_u8(dst - BPS); // top row +#if defined(__aarch64__) + const uint16_t p3 = vaddlvq_u8(A); + sum_top = vdupq_n_u16(p3); +#else const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); const uint16x4_t p2 = vpadd_u16(p1, p1); const uint16x4_t p3 = vpadd_u16(p2, p2); sum_top = vcombine_u16(p3, p3); +#endif } if (do_left) { int i; sum_left = vdupq_n_u16(0); for (i = 0; i < 16; i += 8) { - const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1)); - const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1)); - const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1)); - const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1)); - const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1)); - const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1)); - const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1)); - const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1)); - const uint16x8_t s0 = vaddq_u16(L0, L1); - const uint16x8_t s1 = vaddq_u16(L2, L3); - const uint16x8_t s2 = vaddq_u16(L4, L5); - const uint16x8_t s3 = vaddq_u16(L6, L7); + const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1); + const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1); + const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1); + const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1); + const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1); + const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1); + const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1); + const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1); + const uint16x8_t s0 = vaddl_u8(L0, L1); + const uint16x8_t s1 = vaddl_u8(L2, L3); + const uint16x8_t s2 = vaddl_u8(L4, L5); + const uint16x8_t s3 = vaddl_u8(L6, L7); const uint16x8_t s01 = vaddq_u16(s0, s1); const uint16x8_t s23 = vaddq_u16(s2, s3); const uint16x8_t sum = vaddq_u16(s01, s23); diff --git a/thirdparty/libwebp/src/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h index a784de334a..513e159bb3 100644 --- a/thirdparty/libwebp/src/dsp/dsp.h +++ b/thirdparty/libwebp/src/dsp/dsp.h @@ -27,6 +27,23 @@ extern "C" { #define BPS 32 // this is the common stride for enc/dec //------------------------------------------------------------------------------ +// WEBP_RESTRICT + +// Declares a pointer with the restrict type qualifier if available. +// This allows code to hint to the compiler that only this pointer references a +// particular object or memory region within the scope of the block in which it +// is declared. This may allow for improved optimizations due to the lack of +// pointer aliasing. See also: +// https://en.cppreference.com/w/c/language/restrict +#if defined(__GNUC__) +#define WEBP_RESTRICT __restrict__ +#elif defined(_MSC_VER) +#define WEBP_RESTRICT __restrict +#else +#define WEBP_RESTRICT +#endif + +//------------------------------------------------------------------------------ // CPU detection #if defined(__GNUC__) @@ -51,9 +68,7 @@ extern "C" { # define __has_builtin(x) 0 #endif -// for now, none of the optimizations below are available in emscripten -#if !defined(EMSCRIPTEN) - +#if !defined(HAVE_CONFIG_H) #if defined(_MSC_VER) && _MSC_VER > 1310 && \ (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE2 // Visual C++ SSE2 targets @@ -63,23 +78,37 @@ extern "C" { (defined(_M_X64) || defined(_M_IX86)) #define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets #endif +#endif // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp // files without intrinsics, allowing the corresponding Init() to be called. // Files containing intrinsics will need to be built targeting the instruction // set so should succeed on one of the earlier tests. -#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2) +#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2)) #define WEBP_USE_SSE2 #endif -#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41) +#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2) +#define WEBP_HAVE_SSE2 +#endif + +#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41)) #define WEBP_USE_SSE41 #endif +#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41) +#define WEBP_HAVE_SSE41 +#endif + +#undef WEBP_MSC_SSE41 +#undef WEBP_MSC_SSE2 + // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. -#if (defined(__ARM_NEON__) || \ - defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ +#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \ !defined(__native_client__) #define WEBP_USE_NEON #endif @@ -95,6 +124,10 @@ extern "C" { #define WEBP_USE_INTRINSICS #endif +#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) +#define WEBP_HAVE_NEON +#endif + #if defined(__mips__) && !defined(__mips64) && \ defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) #define WEBP_USE_MIPS32 @@ -110,13 +143,11 @@ extern "C" { #define WEBP_USE_MSA #endif -#endif /* EMSCRIPTEN */ - #ifndef WEBP_DSP_OMIT_C_CODE #define WEBP_DSP_OMIT_C_CODE 1 #endif -#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE +#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE #define WEBP_NEON_OMIT_C_CODE 1 #else #define WEBP_NEON_OMIT_C_CODE 0 @@ -193,6 +224,12 @@ extern "C" { #endif #endif +// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. +// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. +#if !defined(WEBP_OFFSET_PTR) +#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) +#endif + // Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) #if !defined(WEBP_SWAP_16BIT_CSP) #define WEBP_SWAP_16BIT_CSP 0 @@ -572,26 +609,29 @@ extern void (*WebPApplyAlphaMultiply4444)( // Dispatch the values from alpha[] plane to the ARGB destination 'dst'. // Returns true if alpha[] plane has non-trivial values different from 0xff. -extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint8_t* dst, int dst_stride); +extern int (*WebPDispatchAlpha)(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint8_t* WEBP_RESTRICT dst, int dst_stride); // Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the // A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units. -extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride, - int width, int height, - uint32_t* dst, int dst_stride); +extern void (*WebPDispatchAlphaToGreen)(const uint8_t* WEBP_RESTRICT alpha, + int alpha_stride, int width, int height, + uint32_t* WEBP_RESTRICT dst, + int dst_stride); // Extract the alpha values from 32b values in argb[] and pack them into alpha[] // (this is the opposite of WebPDispatchAlpha). // Returns true if there's only trivial 0xff alpha values. -extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride, - int width, int height, - uint8_t* alpha, int alpha_stride); +extern int (*WebPExtractAlpha)(const uint8_t* WEBP_RESTRICT argb, + int argb_stride, int width, int height, + uint8_t* WEBP_RESTRICT alpha, + int alpha_stride); // Extract the green values from 32b values in argb[] and pack them into alpha[] // (this is the opposite of WebPDispatchAlphaToGreen). -extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size); +extern void (*WebPExtractGreen)(const uint32_t* WEBP_RESTRICT argb, + uint8_t* WEBP_RESTRICT alpha, int size); // Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B). // Un-Multiply operation transforms x into x * 255 / A. @@ -604,34 +644,42 @@ void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows, int inverse); // Same for a row of single values, with side alpha values. -extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha, +extern void (*WebPMultRow)(uint8_t* WEBP_RESTRICT const ptr, + const uint8_t* WEBP_RESTRICT const alpha, int width, int inverse); // Same a WebPMultRow(), but for several 'num_rows' rows. -void WebPMultRows(uint8_t* ptr, int stride, - const uint8_t* alpha, int alpha_stride, +void WebPMultRows(uint8_t* WEBP_RESTRICT ptr, int stride, + const uint8_t* WEBP_RESTRICT alpha, int alpha_stride, int width, int num_rows, int inverse); // Plain-C versions, used as fallback by some implementations. -void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha, +void WebPMultRow_C(uint8_t* WEBP_RESTRICT const ptr, + const uint8_t* WEBP_RESTRICT const alpha, int width, int inverse); void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse); #ifdef WORDS_BIGENDIAN // ARGB packing function: a/r/g/b input is rgba or bgra order. -extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, - const uint8_t* g, const uint8_t* b, int len, - uint32_t* out); +extern void (*WebPPackARGB)(const uint8_t* WEBP_RESTRICT a, + const uint8_t* WEBP_RESTRICT r, + const uint8_t* WEBP_RESTRICT g, + const uint8_t* WEBP_RESTRICT b, + int len, uint32_t* WEBP_RESTRICT out); #endif // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order. -extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b, - int len, int step, uint32_t* out); +extern void (*WebPPackRGB)(const uint8_t* WEBP_RESTRICT r, + const uint8_t* WEBP_RESTRICT g, + const uint8_t* WEBP_RESTRICT b, + int len, int step, uint32_t* WEBP_RESTRICT out); // This function returns true if src[i] contains a value different from 0xff. extern int (*WebPHasAlpha8b)(const uint8_t* src, int length); // This function returns true if src[4*i] contains a value different from 0xff. extern int (*WebPHasAlpha32b)(const uint8_t* src, int length); +// replaces transparent values in src[] by 'color'. +extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color); // To be called first before using the above. void WebPInitAlphaProcessing(void); diff --git a/thirdparty/libwebp/src/dsp/enc.c b/thirdparty/libwebp/src/dsp/enc.c index 2fddbc4c52..ea47a3fd95 100644 --- a/thirdparty/libwebp/src/dsp/enc.c +++ b/thirdparty/libwebp/src/dsp/enc.c @@ -773,10 +773,10 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8EncDspInitSSE2(); -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { VP8EncDspInitSSE41(); } @@ -800,7 +800,7 @@ WEBP_DSP_INIT_FUNC(VP8EncDspInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { VP8EncDspInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/filters.c b/thirdparty/libwebp/src/dsp/filters.c index 9e910d99c9..4506567ba3 100644 --- a/thirdparty/libwebp/src/dsp/filters.c +++ b/thirdparty/libwebp/src/dsp/filters.c @@ -254,7 +254,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) { #endif if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8FiltersInitSSE2(); } @@ -271,7 +271,7 @@ WEBP_DSP_INIT_FUNC(VP8FiltersInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { VP8FiltersInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/filters_sse2.c b/thirdparty/libwebp/src/dsp/filters_sse2.c index 4b3f2d020f..5c33ec15e2 100644 --- a/thirdparty/libwebp/src/dsp/filters_sse2.c +++ b/thirdparty/libwebp/src/dsp/filters_sse2.c @@ -320,7 +320,12 @@ extern void VP8FiltersInitSSE2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) { WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2; +#if defined(CHROMIUM) + // TODO(crbug.com/654974) + (void)VerticalUnfilter_SSE2; +#else WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2; +#endif WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2; WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2; diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c index aad5f43ec9..d8bbb02b35 100644 --- a/thirdparty/libwebp/src/dsp/lossless.c +++ b/thirdparty/libwebp/src/dsp/lossless.c @@ -107,62 +107,62 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { //------------------------------------------------------------------------------ // Predictors -static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) { (void)top; (void)left; return ARGB_BLACK; } -static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) { (void)top; return left; } -static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) { (void)left; return top[0]; } -static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) { (void)left; return top[1]; } -static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) { (void)left; return top[-1]; } -static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average3(left, top[0], top[1]); return pred; } -static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[-1]); return pred; } -static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(left, top[0]); return pred; } -static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[-1], top[0]); (void)left; return pred; } -static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average2(top[0], top[1]); (void)left; return pred; } -static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Average4(left, top[-1], top[0], top[1]); return pred; } -static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = Select(top[0], left, top[-1]); return pred; } -static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); return pred; } -static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) { +uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) { const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); return pred; } @@ -182,18 +182,18 @@ static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper, out[i] = left = VP8LAddPixels(in[i], left); } } -GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C) -GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C) -GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C) -GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C) -GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C) -GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C) -GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C) -GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C) -GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C) -GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C) -GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C) -GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C) +GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C) //------------------------------------------------------------------------------ @@ -562,7 +562,6 @@ VP8LPredictorFunc VP8LPredictors[16]; // exposed plain-C implementations VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16]; -VP8LPredictorFunc VP8LPredictors_C[16]; VP8LTransformColorInverseFunc VP8LTransformColorInverse; @@ -576,6 +575,7 @@ VP8LMapARGBFunc VP8LMapColor32b; VP8LMapAlphaFunc VP8LMapColor8b; extern void VP8LDspInitSSE2(void); +extern void VP8LDspInitSSE41(void); extern void VP8LDspInitNEON(void); extern void VP8LDspInitMIPSdspR2(void); extern void VP8LDspInitMSA(void); @@ -600,8 +600,7 @@ extern void VP8LDspInitMSA(void); } while (0); WEBP_DSP_INIT_FUNC(VP8LDspInit) { - COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors) - COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C) + COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd) COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C) @@ -623,9 +622,14 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8LDspInitSSE2(); +#if defined(WEBP_HAVE_SSE41) + if (VP8GetCPUInfo(kSSE4_1)) { + VP8LDspInitSSE41(); + } +#endif } #endif #if defined(WEBP_USE_MIPS_DSP_R2) @@ -640,7 +644,7 @@ WEBP_DSP_INIT_FUNC(VP8LDspInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { VP8LDspInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h index f709cc86b2..ebd316d1ed 100644 --- a/thirdparty/libwebp/src/dsp/lossless.h +++ b/thirdparty/libwebp/src/dsp/lossless.h @@ -30,7 +30,22 @@ extern "C" { typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top); extern VP8LPredictorFunc VP8LPredictors[16]; -extern VP8LPredictorFunc VP8LPredictors_C[16]; + +uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top); +uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top); + // These Add/Sub function expects upper[-1] and out[-1] to be readable. typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in, const uint32_t* upper, int num_pixels, diff --git a/thirdparty/libwebp/src/dsp/lossless_common.h b/thirdparty/libwebp/src/dsp/lossless_common.h index 9c2ebe6809..96a106f9ee 100644 --- a/thirdparty/libwebp/src/dsp/lossless_common.h +++ b/thirdparty/libwebp/src/dsp/lossless_common.h @@ -184,19 +184,6 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \ } \ } -// It subtracts the prediction from the input pixel and stores the residual -// in the output pixel. -#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \ -static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ - int x; \ - assert(upper != NULL); \ - for (x = 0; x < num_pixels; ++x) { \ - const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \ - out[x] = VP8LSubPixels(in[x], pred); \ - } \ -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index 9c36055afc..c3e8537ade 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -329,6 +329,15 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = { static float FastSLog2Slow_C(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { +#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ) + // use clz if available + const int log_cnt = BitsLog2Floor(v) - 7; + const uint32_t y = 1 << log_cnt; + int correction = 0; + const float v_f = (float)v; + const uint32_t orig_v = v; + v >>= log_cnt; +#else int log_cnt = 0; uint32_t y = 1; int correction = 0; @@ -339,6 +348,7 @@ static float FastSLog2Slow_C(uint32_t v) { v = v >> 1; y = y << 1; } while (v >= LOG_LOOKUP_IDX_MAX); +#endif // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256 // Xf = floor(Xf) * (1 + (v % y) / v) // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) @@ -355,6 +365,14 @@ static float FastSLog2Slow_C(uint32_t v) { static float FastLog2Slow_C(uint32_t v) { assert(v >= LOG_LOOKUP_IDX_MAX); if (v < APPROX_LOG_WITH_CORRECTION_MAX) { +#if !defined(WEBP_HAVE_SLOW_CLZ_CTZ) + // use clz if available + const int log_cnt = BitsLog2Floor(v) - 7; + const uint32_t y = 1 << log_cnt; + const uint32_t orig_v = v; + double log_2; + v >>= log_cnt; +#else int log_cnt = 0; uint32_t y = 1; const uint32_t orig_v = v; @@ -364,6 +382,7 @@ static float FastLog2Slow_C(uint32_t v) { v = v >> 1; y = y << 1; } while (v >= LOG_LOOKUP_IDX_MAX); +#endif log_2 = kLog2Table[v] + log_cnt; if (orig_v >= APPROX_LOG_MAX) { // Since the division is still expensive, add this correction factor only @@ -702,140 +721,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a, //------------------------------------------------------------------------------ // Image transforms. -static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { - return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1); -} - -static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) { - return Average2(Average2(a0, a2), a1); -} - -static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1, - uint32_t a2, uint32_t a3) { - return Average2(Average2(a0, a1), Average2(a2, a3)); -} - -static WEBP_INLINE uint32_t Clip255(uint32_t a) { - if (a < 256) { - return a; - } - // return 0, when a is a negative integer. - // return 255, when a is positive. - return ~a >> 24; -} - -static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) { - return Clip255(a + b - c); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, - uint32_t c2) { - const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24); - const int r = AddSubtractComponentFull((c0 >> 16) & 0xff, - (c1 >> 16) & 0xff, - (c2 >> 16) & 0xff); - const int g = AddSubtractComponentFull((c0 >> 8) & 0xff, - (c1 >> 8) & 0xff, - (c2 >> 8) & 0xff); - const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff); - return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; -} - -static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) { - return Clip255(a + (a - b) / 2); -} - -static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, - uint32_t c2) { - const uint32_t ave = Average2(c0, c1); - const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24); - const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff); - const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff); - const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff); - return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b; -} - -// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined. -#if defined(__arm__) && \ - (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408) -# define LOCAL_INLINE __attribute__ ((noinline)) -#else -# define LOCAL_INLINE WEBP_INLINE -#endif - -static LOCAL_INLINE int Sub3(int a, int b, int c) { - const int pb = b - c; - const int pa = a - c; - return abs(pb) - abs(pa); -} - -#undef LOCAL_INLINE - -static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) { - const int pa_minus_pb = - Sub3((a >> 24) , (b >> 24) , (c >> 24) ) + - Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) + - Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) + - Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff); - return (pa_minus_pb <= 0) ? a : b; -} - -//------------------------------------------------------------------------------ -// Predictors - -static uint32_t Predictor2(uint32_t left, const uint32_t* const top) { - (void)left; - return top[0]; -} -static uint32_t Predictor3(uint32_t left, const uint32_t* const top) { - (void)left; - return top[1]; -} -static uint32_t Predictor4(uint32_t left, const uint32_t* const top) { - (void)left; - return top[-1]; -} -static uint32_t Predictor5(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average3(left, top[0], top[1]); - return pred; -} -static uint32_t Predictor6(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[-1]); - return pred; -} -static uint32_t Predictor7(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(left, top[0]); - return pred; -} -static uint32_t Predictor8(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[-1], top[0]); - (void)left; - return pred; -} -static uint32_t Predictor9(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average2(top[0], top[1]); - (void)left; - return pred; -} -static uint32_t Predictor10(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Average4(left, top[-1], top[0], top[1]); - return pred; -} -static uint32_t Predictor11(uint32_t left, const uint32_t* const top) { - const uint32_t pred = Select(top[0], left, top[-1]); - return pred; -} -static uint32_t Predictor12(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]); - return pred; -} -static uint32_t Predictor13(uint32_t left, const uint32_t* const top) { - const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]); - return pred; -} - -//------------------------------------------------------------------------------ - static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; @@ -850,18 +735,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper, (void)upper; } -GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C) -GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C) -GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C) -GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C) -GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C) -GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C) -GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C) -GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C) -GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C) -GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C) -GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C) -GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C) +// It subtracts the prediction from the input pixel and stores the residual +// in the output pixel. +#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \ +static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \ + const uint32_t* upper, \ + int num_pixels, uint32_t* out) { \ + int x; \ + assert(upper != NULL); \ + for (x = 0; x < num_pixels; ++x) { \ + const uint32_t pred = \ + VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \ + out[x] = VP8LSubPixels(in[x], pred); \ + } \ +} + +GENERATE_PREDICTOR_SUB(2) +GENERATE_PREDICTOR_SUB(3) +GENERATE_PREDICTOR_SUB(4) +GENERATE_PREDICTOR_SUB(5) +GENERATE_PREDICTOR_SUB(6) +GENERATE_PREDICTOR_SUB(7) +GENERATE_PREDICTOR_SUB(8) +GENERATE_PREDICTOR_SUB(9) +GENERATE_PREDICTOR_SUB(10) +GENERATE_PREDICTOR_SUB(11) +GENERATE_PREDICTOR_SUB(12) +GENERATE_PREDICTOR_SUB(13) //------------------------------------------------------------------------------ @@ -962,10 +862,10 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8LEncDspInitSSE2(); -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { VP8LEncDspInitSSE41(); } @@ -989,7 +889,7 @@ WEBP_DSP_INIT_FUNC(VP8LEncDspInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { VP8LEncDspInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index e676f6fdc9..b2f83b871c 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -232,76 +232,55 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { //------------------------------------------------------------------------------ // Entropy -// Checks whether the X or Y contribution is worth computing and adding. -// Used in loop unrolling. -#define ANALYZE_X_OR_Y(x_or_y, j) \ - do { \ - if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \ - } while (0) - -// Checks whether the X + Y contribution is worth computing and adding. -// Used in loop unrolling. -#define ANALYZE_XY(j) \ - do { \ - if (tmp[j] != 0) { \ - retval -= VP8LFastSLog2(tmp[j]); \ - ANALYZE_X_OR_Y(X, j); \ - } \ - } while (0) +// TODO(https://crbug.com/webp/499): this function produces different results +// from the C code due to use of double/float resulting in output differences +// when compared to -noasm. +#if !(defined(WEBP_HAVE_SLOW_CLZ_CTZ) || defined(__i386__) || defined(_M_IX86)) static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { int i; double retval = 0.; - int sumX, sumXY; - int32_t tmp[4]; - __m128i zero = _mm_setzero_si128(); - // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY). - __m128i sumXY_128 = zero; - __m128i sumX_128 = zero; - - for (i = 0; i < 256; i += 4) { - const __m128i x = _mm_loadu_si128((const __m128i*)(X + i)); - const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i)); - - // Check if any X is non-zero: this actually provides a speedup as X is - // usually sparse. - if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) { - const __m128i xy_128 = _mm_add_epi32(x, y); - sumXY_128 = _mm_add_epi32(sumXY_128, xy_128); - - sumX_128 = _mm_add_epi32(sumX_128, x); - - // Analyze the different X + Y. - _mm_storeu_si128((__m128i*)tmp, xy_128); - - ANALYZE_XY(0); - ANALYZE_XY(1); - ANALYZE_XY(2); - ANALYZE_XY(3); - } else { - // X is fully 0, so only deal with Y. - sumXY_128 = _mm_add_epi32(sumXY_128, y); - - ANALYZE_X_OR_Y(Y, 0); - ANALYZE_X_OR_Y(Y, 1); - ANALYZE_X_OR_Y(Y, 2); - ANALYZE_X_OR_Y(Y, 3); + int sumX = 0, sumXY = 0; + const __m128i zero = _mm_setzero_si128(); + + for (i = 0; i < 256; i += 16) { + const __m128i x0 = _mm_loadu_si128((const __m128i*)(X + i + 0)); + const __m128i y0 = _mm_loadu_si128((const __m128i*)(Y + i + 0)); + const __m128i x1 = _mm_loadu_si128((const __m128i*)(X + i + 4)); + const __m128i y1 = _mm_loadu_si128((const __m128i*)(Y + i + 4)); + const __m128i x2 = _mm_loadu_si128((const __m128i*)(X + i + 8)); + const __m128i y2 = _mm_loadu_si128((const __m128i*)(Y + i + 8)); + const __m128i x3 = _mm_loadu_si128((const __m128i*)(X + i + 12)); + const __m128i y3 = _mm_loadu_si128((const __m128i*)(Y + i + 12)); + const __m128i x4 = _mm_packs_epi16(_mm_packs_epi32(x0, x1), + _mm_packs_epi32(x2, x3)); + const __m128i y4 = _mm_packs_epi16(_mm_packs_epi32(y0, y1), + _mm_packs_epi32(y2, y3)); + const int32_t mx = _mm_movemask_epi8(_mm_cmpgt_epi8(x4, zero)); + int32_t my = _mm_movemask_epi8(_mm_cmpgt_epi8(y4, zero)) | mx; + while (my) { + const int32_t j = BitsCtz(my); + int xy; + if ((mx >> j) & 1) { + const int x = X[i + j]; + sumXY += x; + retval -= VP8LFastSLog2(x); + } + xy = X[i + j] + Y[i + j]; + sumX += xy; + retval -= VP8LFastSLog2(xy); + my &= my - 1; } } - - // Sum up sumX_128 to get sumX. - _mm_storeu_si128((__m128i*)tmp, sumX_128); - sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0]; - - // Sum up sumXY_128 to get sumXY. - _mm_storeu_si128((__m128i*)tmp, sumXY_128); - sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0]; - retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); return (float)retval; } -#undef ANALYZE_X_OR_Y -#undef ANALYZE_XY + +#else + +#define DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC // won't be faster + +#endif //------------------------------------------------------------------------------ @@ -460,20 +439,22 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, (void)upper; } -#define GENERATE_PREDICTOR_1(X, IN) \ -static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \ - int num_pixels, uint32_t* out) { \ - int i; \ - for (i = 0; i + 4 <= num_pixels; i += 4) { \ - const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ - const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \ - const __m128i res = _mm_sub_epi8(src, pred); \ - _mm_storeu_si128((__m128i*)&out[i], res); \ - } \ - if (i != num_pixels) { \ - VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \ - } \ -} +#define GENERATE_PREDICTOR_1(X, IN) \ + static void PredictorSub##X##_SSE2(const uint32_t* const in, \ + const uint32_t* const upper, \ + int num_pixels, uint32_t* const out) { \ + int i; \ + for (i = 0; i + 4 <= num_pixels; i += 4) { \ + const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \ + const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \ + const __m128i res = _mm_sub_epi8(src, pred); \ + _mm_storeu_si128((__m128i*)&out[i], res); \ + } \ + if (i != num_pixels) { \ + VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \ + num_pixels - i, out + i); \ + } \ + } GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T @@ -657,7 +638,9 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2; VP8LAddVector = AddVector_SSE2; VP8LAddVectorEq = AddVectorEq_SSE2; +#if !defined(DONT_USE_COMBINED_SHANNON_ENTROPY_SSE2_FUNC) VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2; +#endif VP8LVectorMismatch = VectorMismatch_SSE2; VP8LBundleColorMap = BundleColorMap_SSE2; diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c index 719d8ed25e..ad358a6f25 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c @@ -44,46 +44,47 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data, //------------------------------------------------------------------------------ // Color Transform -#define SPAN 8 +#define MK_CST_16(HI, LO) \ + _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff))) + static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, int histo[]) { - const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue)); - const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue)); - const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask - const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask - const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask - const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1, - -1, -1, -1, -1, -1, -1, -1); - const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, - 2, -1, 6, -1, 10, -1, 14); - int y; - for (y = 0; y < tile_height; ++y) { - const uint32_t* const src = argb + y * stride; - int i, x; - for (x = 0; x + SPAN <= tile_width; x += SPAN) { - uint16_t values[SPAN]; - const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); - const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); - const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo); - const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi); - const __m128i r = _mm_or_si128(r0, r1); // r 0 - const __m128i gb0 = _mm_and_si128(in0, mask_gb); - const __m128i gb1 = _mm_and_si128(in1, mask_gb); - const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b - const __m128i g = _mm_and_si128(gb, mask_g); // g 0 - const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr - const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg - const __m128i C = _mm_sub_epi8(gb, B); // x b' - const __m128i D = _mm_sub_epi8(C, A); // x b'' - const __m128i E = _mm_and_si128(D, mask_b); // 0 b'' - _mm_storeu_si128((__m128i*)values, E); - for (i = 0; i < SPAN; ++i) ++histo[values[i]]; + const __m128i mult = + MK_CST_16(CST_5b(red_to_blue) + 256,CST_5b(green_to_blue)); + const __m128i perm = + _mm_setr_epi8(-1, 1, -1, 2, -1, 5, -1, 6, -1, 9, -1, 10, -1, 13, -1, 14); + if (tile_width >= 4) { + int y; + for (y = 0; y < tile_height; ++y) { + const uint32_t* const src = argb + y * stride; + const __m128i A1 = _mm_loadu_si128((const __m128i*)src); + const __m128i B1 = _mm_shuffle_epi8(A1, perm); + const __m128i C1 = _mm_mulhi_epi16(B1, mult); + const __m128i D1 = _mm_sub_epi16(A1, C1); + __m128i E = _mm_add_epi16(_mm_srli_epi32(D1, 16), D1); + int x; + for (x = 4; x + 4 <= tile_width; x += 4) { + const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); + __m128i B2, C2, D2; + ++histo[_mm_extract_epi8(E, 0)]; + B2 = _mm_shuffle_epi8(A2, perm); + ++histo[_mm_extract_epi8(E, 4)]; + C2 = _mm_mulhi_epi16(B2, mult); + ++histo[_mm_extract_epi8(E, 8)]; + D2 = _mm_sub_epi16(A2, C2); + ++histo[_mm_extract_epi8(E, 12)]; + E = _mm_add_epi16(_mm_srli_epi32(D2, 16), D2); + } + ++histo[_mm_extract_epi8(E, 0)]; + ++histo[_mm_extract_epi8(E, 4)]; + ++histo[_mm_extract_epi8(E, 8)]; + ++histo[_mm_extract_epi8(E, 12)]; } } { - const int left_over = tile_width & (SPAN - 1); + const int left_over = tile_width & 3; if (left_over > 0) { VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, left_over, tile_height, @@ -95,33 +96,37 @@ static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride, static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, int tile_width, int tile_height, int green_to_red, int histo[]) { - const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red)); - const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask - const __m128i mask = _mm_set1_epi16(0xff); - - int y; - for (y = 0; y < tile_height; ++y) { - const uint32_t* const src = argb + y * stride; - int i, x; - for (x = 0; x + SPAN <= tile_width; x += SPAN) { - uint16_t values[SPAN]; - const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); - const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); - const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 - const __m128i g1 = _mm_and_si128(in1, mask_g); - const __m128i g = _mm_packus_epi32(g0, g1); // g 0 - const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r - const __m128i A1 = _mm_srli_epi32(in1, 16); - const __m128i A = _mm_packus_epi32(A0, A1); // x r - const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr - const __m128i C = _mm_sub_epi8(A, B); // x r' - const __m128i D = _mm_and_si128(C, mask); // 0 r' - _mm_storeu_si128((__m128i*)values, D); - for (i = 0; i < SPAN; ++i) ++histo[values[i]]; + + const __m128i mult = MK_CST_16(0, CST_5b(green_to_red)); + const __m128i mask_g = _mm_set1_epi32(0x0000ff00); + if (tile_width >= 4) { + int y; + for (y = 0; y < tile_height; ++y) { + const uint32_t* const src = argb + y * stride; + const __m128i A1 = _mm_loadu_si128((const __m128i*)src); + const __m128i B1 = _mm_and_si128(A1, mask_g); + const __m128i C1 = _mm_madd_epi16(B1, mult); + __m128i D = _mm_sub_epi16(A1, C1); + int x; + for (x = 4; x + 4 <= tile_width; x += 4) { + const __m128i A2 = _mm_loadu_si128((const __m128i*)(src + x)); + __m128i B2, C2; + ++histo[_mm_extract_epi8(D, 2)]; + B2 = _mm_and_si128(A2, mask_g); + ++histo[_mm_extract_epi8(D, 6)]; + C2 = _mm_madd_epi16(B2, mult); + ++histo[_mm_extract_epi8(D, 10)]; + ++histo[_mm_extract_epi8(D, 14)]; + D = _mm_sub_epi16(A2, C2); + } + ++histo[_mm_extract_epi8(D, 2)]; + ++histo[_mm_extract_epi8(D, 6)]; + ++histo[_mm_extract_epi8(D, 10)]; + ++histo[_mm_extract_epi8(D, 14)]; } } { - const int left_over = tile_width & (SPAN - 1); + const int left_over = tile_width & 3; if (left_over > 0) { VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, left_over, tile_height, green_to_red, @@ -130,6 +135,8 @@ static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride, } } +#undef MK_CST_16 + //------------------------------------------------------------------------------ // Entry point diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c index aef0cee1b3..3a0eb440db 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c @@ -18,7 +18,6 @@ #include "src/dsp/common_sse2.h" #include "src/dsp/lossless.h" #include "src/dsp/lossless_common.h" -#include <assert.h> #include <emmintrin.h> //------------------------------------------------------------------------------ diff --git a/thirdparty/libwebp/src/dsp/lossless_sse41.c b/thirdparty/libwebp/src/dsp/lossless_sse41.c new file mode 100644 index 0000000000..b0d6daa7fe --- /dev/null +++ b/thirdparty/libwebp/src/dsp/lossless_sse41.c @@ -0,0 +1,132 @@ +// Copyright 2021 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// SSE41 variant of methods for lossless decoder + +#include "src/dsp/dsp.h" + +#if defined(WEBP_USE_SSE41) + +#include "src/dsp/common_sse41.h" +#include "src/dsp/lossless.h" +#include "src/dsp/lossless_common.h" + +//------------------------------------------------------------------------------ +// Color-space conversion functions + +static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, + const uint32_t* const src, + int num_pixels, uint32_t* dst) { +// sign-extended multiplying constants, pre-shifted by 5. +#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend + const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 | + (CST(green_to_blue_) & 0xffff)); + const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); +#undef CST + const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); + const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, + -1, 9, -1, 9, -1, 13, -1, 13); + const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, + -1, 10, -1, -1, -1, 14, -1, -1); + int i; + for (i = 0; i + 4 <= num_pixels; i += 4) { + const __m128i A = _mm_loadu_si128((const __m128i*)(src + i)); + const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0 + const __m128i C = _mm_mulhi_epi16(B, mults_rb); + const __m128i D = _mm_add_epi8(A, C); + const __m128i E = _mm_shuffle_epi8(D, perm2); + const __m128i F = _mm_mulhi_epi16(E, mults_b2); + const __m128i G = _mm_add_epi8(D, F); + const __m128i out = _mm_blendv_epi8(G, A, mask_ag); + _mm_storeu_si128((__m128i*)&dst[i], out); + } + // Fall-back to C-version for left-overs. + if (i != num_pixels) { + VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i); + } +} + +//------------------------------------------------------------------------------ + +#define ARGB_TO_RGB_SSE41 do { \ + while (num_pixels >= 16) { \ + const __m128i in0 = _mm_loadu_si128(in + 0); \ + const __m128i in1 = _mm_loadu_si128(in + 1); \ + const __m128i in2 = _mm_loadu_si128(in + 2); \ + const __m128i in3 = _mm_loadu_si128(in + 3); \ + const __m128i a0 = _mm_shuffle_epi8(in0, perm0); \ + const __m128i a1 = _mm_shuffle_epi8(in1, perm1); \ + const __m128i a2 = _mm_shuffle_epi8(in2, perm2); \ + const __m128i a3 = _mm_shuffle_epi8(in3, perm3); \ + const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \ + const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \ + const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \ + _mm_storeu_si128(out + 0, b0); \ + _mm_storeu_si128(out + 1, b1); \ + _mm_storeu_si128(out + 2, b2); \ + in += 4; \ + out += 3; \ + num_pixels -= 16; \ + } \ +} while (0) + +static void ConvertBGRAToRGB_SSE41(const uint32_t* src, int num_pixels, + uint8_t* dst) { + const __m128i* in = (const __m128i*)src; + __m128i* out = (__m128i*)dst; + const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, + 8, 14, 13, 12, -1, -1, -1, -1); + const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); + const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); + const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); + + ARGB_TO_RGB_SSE41; + + // left-overs + if (num_pixels > 0) { + VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } +} + +static void ConvertBGRAToBGR_SSE41(const uint32_t* src, + int num_pixels, uint8_t* dst) { + const __m128i* in = (const __m128i*)src; + __m128i* out = (__m128i*)dst; + const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, + 12, 13, 14, -1, -1, -1, -1); + const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39); + const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e); + const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93); + + ARGB_TO_RGB_SSE41; + + // left-overs + if (num_pixels > 0) { + VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out); + } +} + +#undef ARGB_TO_RGB_SSE41 + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8LDspInitSSE41(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) { + VP8LTransformColorInverse = TransformColorInverse_SSE41; + VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41; + VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41; +} + +#else // !WEBP_USE_SSE41 + +WEBP_DSP_INIT_STUB(VP8LDspInitSSE41) + +#endif // WEBP_USE_SSE41 diff --git a/thirdparty/libwebp/src/dsp/rescaler.c b/thirdparty/libwebp/src/dsp/rescaler.c index c5a01e82df..14620ce4f1 100644 --- a/thirdparty/libwebp/src/dsp/rescaler.c +++ b/thirdparty/libwebp/src/dsp/rescaler.c @@ -38,8 +38,9 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, int x_out = channel; // simple bilinear interpolation int accum = wrk->x_add; - int left = src[x_in]; - int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left; + rescaler_t left = (rescaler_t)src[x_in]; + rescaler_t right = + (wrk->src_width > 1) ? (rescaler_t)src[x_in + x_stride] : left; x_in += x_stride; while (1) { wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum; @@ -50,7 +51,7 @@ void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk, left = right; x_in += x_stride; assert(x_in < wrk->src_width * x_stride); - right = src[x_in]; + right = (rescaler_t)src[x_in]; accum += wrk->x_add; } } @@ -213,7 +214,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) { WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C; if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPRescalerDspInitSSE2(); } @@ -235,7 +236,7 @@ WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { WebPRescalerDspInitNEON(); diff --git a/thirdparty/libwebp/src/dsp/ssim.c b/thirdparty/libwebp/src/dsp/ssim.c index 989ce8254c..f85c2e6e5b 100644 --- a/thirdparty/libwebp/src/dsp/ssim.c +++ b/thirdparty/libwebp/src/dsp/ssim.c @@ -150,7 +150,7 @@ WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) { #endif if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { VP8SSIMDspInitSSE2(); } diff --git a/thirdparty/libwebp/src/dsp/upsampling.c b/thirdparty/libwebp/src/dsp/upsampling.c index 9b60da5bbb..87f771f3eb 100644 --- a/thirdparty/libwebp/src/dsp/upsampling.c +++ b/thirdparty/libwebp/src/dsp/upsampling.c @@ -233,12 +233,12 @@ WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) { WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C; if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitYUV444ConvertersSSE2(); } #endif -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { WebPInitYUV444ConvertersSSE41(); } @@ -278,12 +278,12 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitUpsamplersSSE2(); } #endif -#if defined(WEBP_USE_SSE41) +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { WebPInitUpsamplersSSE41(); } @@ -300,7 +300,7 @@ WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) { #endif } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { WebPInitUpsamplersNEON(); diff --git a/thirdparty/libwebp/src/dsp/yuv.c b/thirdparty/libwebp/src/dsp/yuv.c index 14e67fc28e..48466f8b11 100644 --- a/thirdparty/libwebp/src/dsp/yuv.c +++ b/thirdparty/libwebp/src/dsp/yuv.c @@ -90,16 +90,16 @@ WEBP_DSP_INIT_FUNC(WebPInitSamplers) { // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitSamplersSSE2(); } -#endif // WEBP_USE_SSE2 -#if defined(WEBP_USE_SSE41) +#endif // WEBP_HAVE_SSE2 +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { WebPInitSamplersSSE41(); } -#endif // WEBP_USE_SSE41 +#endif // WEBP_HAVE_SSE41 #if defined(WEBP_USE_MIPS32) if (VP8GetCPUInfo(kMIPS32)) { WebPInitSamplersMIPS32(); @@ -276,26 +276,26 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { #endif if (VP8GetCPUInfo != NULL) { -#if defined(WEBP_USE_SSE2) +#if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitConvertARGBToYUVSSE2(); WebPInitSharpYUVSSE2(); } -#endif // WEBP_USE_SSE2 -#if defined(WEBP_USE_SSE41) +#endif // WEBP_HAVE_SSE2 +#if defined(WEBP_HAVE_SSE41) if (VP8GetCPUInfo(kSSE4_1)) { WebPInitConvertARGBToYUVSSE41(); } -#endif // WEBP_USE_SSE41 +#endif // WEBP_HAVE_SSE41 } -#if defined(WEBP_USE_NEON) +#if defined(WEBP_HAVE_NEON) if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { WebPInitConvertARGBToYUVNEON(); WebPInitSharpYUVNEON(); } -#endif // WEBP_USE_NEON +#endif // WEBP_HAVE_NEON assert(WebPConvertARGBToY != NULL); assert(WebPConvertARGBToUV != NULL); |