diff options
Diffstat (limited to 'thirdparty/libwebp/src/dsp')
-rw-r--r-- | thirdparty/libwebp/src/dsp/alpha_processing_sse2.c | 12 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/alpha_processing_sse41.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/cpu.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/cpu.h | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/dec_sse2.c | 93 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/dec_sse41.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/enc_neon.c | 9 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/enc_sse2.c | 67 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless.c | 12 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc.c | 18 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc_sse2.c | 8 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_sse2.c | 88 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_sse41.c | 7 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/quant.h | 13 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/rescaler_sse2.c | 6 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/upsampling_sse2.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv_sse2.c | 13 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv_sse41.c | 6 |
18 files changed, 189 insertions, 173 deletions
diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c index a5f8c9f7c7..f0843d0feb 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -26,8 +26,8 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, uint32_t alpha_and = 0xff; int i, j; const __m128i zero = _mm_setzero_si128(); - const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -106,8 +106,8 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; int i, j; - const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i a_mask = _mm_set1_epi32(0xff); // to preserve alpha + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -178,7 +178,7 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, int w, int h, int stride) { const __m128i zero = _mm_setzero_si128(); - const __m128i kMult = _mm_set1_epi16(0x8081u); + const __m128i kMult = _mm_set1_epi16((short)0x8081); const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); const int kSpan = 4; while (h-- > 0) { @@ -267,7 +267,7 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) { } static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { - const __m128i m_color = _mm_set1_epi32(color); + const __m128i m_color = _mm_set1_epi32((int)color); const __m128i zero = _mm_setzero_si128(); int i = 0; for (; i + 8 <= length; i += 8) { diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c index cdf877ce49..1156ac3417 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c @@ -26,7 +26,7 @@ static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb, // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; int i, j; - const __m128i all_0xff = _mm_set1_epi32(~0u); + const __m128i all_0xff = _mm_set1_epi32(~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte diff --git a/thirdparty/libwebp/src/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c index a4ba7f2cb7..62de73f750 100644 --- a/thirdparty/libwebp/src/dsp/cpu.c +++ b/thirdparty/libwebp/src/dsp/cpu.c @@ -212,7 +212,7 @@ VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; #elif defined(WEBP_HAVE_NEON) // In most cases this function doesn't check for NEON support (it's assumed by // the configuration), but enables turning off NEON at runtime, for testing -// purposes, by setting VP8DecGetCPUInfo = NULL. +// purposes, by setting VP8GetCPUInfo = NULL. static int armCPUInfo(CPUFeature feature) { if (feature != kNEON) return 0; #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD) diff --git a/thirdparty/libwebp/src/dsp/cpu.h b/thirdparty/libwebp/src/dsp/cpu.h index 57a40d87d4..be80727c0d 100644 --- a/thirdparty/libwebp/src/dsp/cpu.h +++ b/thirdparty/libwebp/src/dsp/cpu.h @@ -14,6 +14,8 @@ #ifndef WEBP_DSP_CPU_H_ #define WEBP_DSP_CPU_H_ +#include <stddef.h> + #ifdef HAVE_CONFIG_H #include "src/webp/config.h" #endif diff --git a/thirdparty/libwebp/src/dsp/dec_sse2.c b/thirdparty/libwebp/src/dsp/dec_sse2.c index 873aa59e8a..01e6bcb636 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse2.c +++ b/thirdparty/libwebp/src/dsp/dec_sse2.c @@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. - dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); @@ -187,10 +187,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); } else { // Store four bytes/pixels per line. - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } } } @@ -213,10 +213,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const __m128i m3 = _mm_subs_epi16(B, d4); const __m128i zero = _mm_setzero_si128(); // Load the source pixels. - __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); @@ -233,10 +233,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } #undef MUL #endif // USE_TRANSFORM_AC3 @@ -477,11 +477,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( - WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]), - WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride])); + WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]), + WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride])); const __m128i A1 = _mm_set_epi32( - WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]), - WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride])); + WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]), + WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride])); // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 @@ -540,7 +540,7 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x, uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { - WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(*x)); *x = _mm_srli_si128(*x, 4); } } @@ -908,10 +908,10 @@ static void VE4_SSE2(uint8_t* dst) { // vertical const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -925,10 +925,10 @@ static void LD4_SSE2(uint8_t* dst) { // Down-Left const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static void VR4_SSE2(uint8_t* dst) { // Vertical-Right @@ -946,10 +946,10 @@ static void VR4_SSE2(uint8_t* dst) { // Vertical-Right const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -970,11 +970,12 @@ static void VL4_SSE2(uint8_t* dst) { // Vertical-Left const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -990,7 +991,7 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const uint32_t K = dst[-1 + 2 * BPS]; const uint32_t L = dst[-1 + 3 * BPS]; const __m128i LKJI_____ = - _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24)); + _mm_cvtsi32_si128((int)(L | (K << 8) | (J << 16) | (I << 24))); const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD); const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); @@ -998,10 +999,10 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } #undef DST @@ -1015,13 +1016,13 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 4) { - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); for (y = 0; y < 4; ++y, dst += BPS) { const int val = dst[-1] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } else if (size == 8) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); @@ -1062,7 +1063,7 @@ static void VE16_SSE2(uint8_t* dst) { static void HE16_SSE2(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { - const __m128i values = _mm_set1_epi8(dst[-1]); + const __m128i values = _mm_set1_epi8((char)dst[-1]); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; } @@ -1070,7 +1071,7 @@ static void HE16_SSE2(uint8_t* dst) { // horizontal static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_storeu_si128((__m128i*)(dst + j * BPS), values); } @@ -1130,7 +1131,7 @@ static void VE8uv_SSE2(uint8_t* dst) { // vertical // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } diff --git a/thirdparty/libwebp/src/dsp/dec_sse41.c b/thirdparty/libwebp/src/dsp/dec_sse41.c index 8f18506d54..08a3630272 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse41.c +++ b/thirdparty/libwebp/src/dsp/dec_sse41.c @@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { - const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4)); + const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4)); const __m128i values = _mm_shuffle_epi8(in, kShuffle3); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; diff --git a/thirdparty/libwebp/src/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c index 601962ba76..3a04111c55 100644 --- a/thirdparty/libwebp/src/dsp/enc_neon.c +++ b/thirdparty/libwebp/src/dsp/enc_neon.c @@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { +#if defined(__aarch64__) + return (int)vaddvq_u32(sum); +#else const uint64x2_t sum2 = vpaddlq_u32(sum); - const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); - return (int)sum3; + const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)), + vreinterpret_u32_u64(vget_high_u64(sum2))); + return (int)vget_lane_u32(sum3, 0); +#endif } static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { diff --git a/thirdparty/libwebp/src/dsp/enc_sse2.c b/thirdparty/libwebp/src/dsp/enc_sse2.c index b2e78ed941..1d1055668f 100644 --- a/thirdparty/libwebp/src/dsp/enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/enc_sse2.c @@ -156,10 +156,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. - ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); + ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); + ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); + ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); + ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); @@ -185,10 +185,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. - WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); - WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); - WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); - WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); + WebPInt32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); + WebPInt32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); + WebPInt32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); + WebPInt32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); } } } @@ -481,7 +481,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } @@ -489,7 +489,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_store_si128((__m128i*)(dst + j * BPS), values); } @@ -540,7 +540,7 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 8; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_storel_epi64((__m128i*)dst, values); dst += BPS; } @@ -549,7 +549,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 16; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_store_si128((__m128i*)dst, values); dst += BPS; } @@ -722,10 +722,10 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -760,10 +760,10 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void VR4_SSE2(uint8_t* dst, @@ -782,10 +782,10 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -807,11 +807,12 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst, const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -829,10 +830,10 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { @@ -875,14 +876,14 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); int y; for (y = 0; y < 4; ++y, dst += BPS) { const int val = top[-2 - y] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c index 84a54296fd..fb86e58d4a 100644 --- a/thirdparty/libwebp/src/dsp/lossless.c +++ b/thirdparty/libwebp/src/dsp/lossless.c @@ -49,7 +49,7 @@ static WEBP_INLINE uint32_t Clip255(uint32_t a) { } static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) { - return Clip255(a + b - c); + return Clip255((uint32_t)(a + b - c)); } static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, @@ -66,7 +66,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, } static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) { - return Clip255(a + (a - b) / 2); + return Clip255((uint32_t)(a + (a - b) / 2)); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, @@ -293,10 +293,10 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, const uint32_t red = argb >> 16; int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red += ColorTransformDelta(m->green_to_red_, green); + new_red += ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue += ColorTransformDelta(m->green_to_blue_, green); - new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red); + new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue += ColorTransformDelta((int8_t)m->red_to_blue_, (int8_t)new_red); new_blue &= 0xff; dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -395,7 +395,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, assert(row_start < row_end); assert(row_end <= transform->ysize_); switch (transform->type_) { - case SUBTRACT_GREEN: + case SUBTRACT_GREEN_TRANSFORM: VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out); break; case PREDICTOR_TRANSFORM: diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index de6c4ace5f..b1f9f26d72 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -522,11 +522,11 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[], void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { int i; for (i = 0; i < num_pixels; ++i) { - const int argb = argb_data[i]; + const int argb = (int)argb_data[i]; const int green = (argb >> 8) & 0xff; const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff; const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff; - argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b; + argb_data[i] = ((uint32_t)argb & 0xff00ff00u) | (new_r << 16) | new_b; } } @@ -547,10 +547,10 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, const int8_t red = U32ToS8(argb >> 16); int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red -= ColorTransformDelta(m->green_to_red_, green); + new_red -= ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue -= ColorTransformDelta(m->green_to_blue_, green); - new_blue -= ColorTransformDelta(m->red_to_blue_, red); + new_blue -= ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue -= ColorTransformDelta((int8_t)m->red_to_blue_, red); new_blue &= 0xff; data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -560,7 +560,7 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); int new_red = argb >> 16; - new_red -= ColorTransformDelta(green_to_red, green); + new_red -= ColorTransformDelta((int8_t)green_to_red, green); return (new_red & 0xff); } @@ -569,9 +569,9 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); const int8_t red = U32ToS8(argb >> 16); - uint8_t new_blue = argb & 0xff; - new_blue -= ColorTransformDelta(green_to_blue, green); - new_blue -= ColorTransformDelta(red_to_blue, red); + int new_blue = argb & 0xff; + new_blue -= ColorTransformDelta((int8_t)green_to_blue, green); + new_blue -= ColorTransformDelta((int8_t)red_to_blue, red); return (new_blue & 0xff); } diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index 948001a3d5..66cbaab772 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -54,8 +54,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m, const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks - const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks + const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb @@ -376,7 +376,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, break; } case 2: { - const __m128i mask_or = _mm_set1_epi32(0xff000000); + const __m128i mask_or = _mm_set1_epi32((int)0xff000000); const __m128i mul_cst = _mm_set1_epi16(0x0104); const __m128i mask_mul = _mm_set1_epi16(0x0f00); for (x = 0; x + 16 <= width; x += 16, dst += 4) { @@ -427,7 +427,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_sub_epi8(src, black); diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c index 396cb0bdfc..4b6a532c23 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c @@ -27,23 +27,22 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i V1 = _mm_add_epi16(C0, C1); const __m128i V2 = _mm_sub_epi16(V1, C2); const __m128i b = _mm_packus_epi16(V2, V2); - const uint32_t output = _mm_cvtsi128_si32(b); - return output; + return (uint32_t)_mm_cvtsi128_si32(b); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i avg = _mm_add_epi16(C1, C0); const __m128i A0 = _mm_srli_epi16(avg, 1); const __m128i A1 = _mm_sub_epi16(A0, B0); @@ -52,16 +51,15 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, const __m128i A3 = _mm_srai_epi16(A2, 1); const __m128i A4 = _mm_add_epi16(A0, A3); const __m128i A5 = _mm_packus_epi16(A4, A4); - const uint32_t output = _mm_cvtsi128_si32(A5); - return output; + return (uint32_t)_mm_cvtsi128_si32(A5); } static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) { int pa_minus_pb; const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_cvtsi32_si128(a); - const __m128i B0 = _mm_cvtsi32_si128(b); - const __m128i C0 = _mm_cvtsi32_si128(c); + const __m128i A0 = _mm_cvtsi32_si128((int)a); + const __m128i B0 = _mm_cvtsi32_si128((int)b); + const __m128i C0 = _mm_cvtsi32_si128((int)c); const __m128i AC0 = _mm_subs_epu8(A0, C0); const __m128i CA0 = _mm_subs_epu8(C0, A0); const __m128i BC0 = _mm_subs_epu8(B0, C0); @@ -94,8 +92,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, __m128i* const avg) { // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) const __m128i ones = _mm_set1_epi8(1); - const __m128i A0 = _mm_cvtsi32_si128(a0); - const __m128i A1 = _mm_cvtsi32_si128(a1); + const __m128i A0 = _mm_cvtsi32_si128((int)a0); + const __m128i A1 = _mm_cvtsi32_si128((int)a1); const __m128i avg1 = _mm_avg_epu8(A0, A1); const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones); *avg = _mm_sub_epi8(avg1, one); @@ -103,8 +101,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(A1, A0); return _mm_srli_epi16(sum, 1); } @@ -112,19 +110,18 @@ static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) { __m128i output; Average2_uint32_SSE2(a0, a1, &output); - return _mm_cvtsi128_si32(output); + return (uint32_t)_mm_cvtsi128_si32(output); } static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1, uint32_t a2) { const __m128i zero = _mm_setzero_si128(); const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(avg1, A1); const __m128i avg2 = _mm_srli_epi16(sum, 1); const __m128i A2 = _mm_packus_epi16(avg2, avg2); - const uint32_t output = _mm_cvtsi128_si32(A2); - return output; + return (uint32_t)_mm_cvtsi128_si32(A2); } static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, @@ -134,8 +131,7 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, const __m128i sum = _mm_add_epi16(avg2, avg1); const __m128i avg3 = _mm_srli_epi16(sum, 1); const __m128i A0 = _mm_packus_epi16(avg3, avg3); - const uint32_t output = _mm_cvtsi128_si32(A0); - return output; + return (uint32_t)_mm_cvtsi128_si32(A0); } static uint32_t Predictor5_SSE2(const uint32_t* const left, @@ -192,7 +188,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* const left, static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_add_epi8(src, black); @@ -208,7 +204,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i prev = _mm_set1_epi32(out[-1]); + __m128i prev = _mm_set1_epi32((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { // a | b | c | d const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); @@ -285,12 +281,12 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) #undef GENERATE_PREDICTOR_2 // Predictor10: average of (average of (L,TL), average of (T, TR)). -#define DO_PRED10(OUT) do { \ - __m128i avgLTL, avg; \ - Average2_m128i(&L, &TL, &avgLTL); \ - Average2_m128i(&avgTTR, &avgLTL, &avg); \ - L = _mm_add_epi8(avg, src); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +#define DO_PRED10(OUT) do { \ + __m128i avgLTL, avg; \ + Average2_m128i(&L, &TL, &avgLTL); \ + Average2_m128i(&avgTTR, &avgLTL, &avg); \ + L = _mm_add_epi8(avg, src); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED10_SHIFT do { \ @@ -303,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -336,7 +332,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i B = _mm_andnot_si128(mask, T); \ const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \ L = _mm_add_epi8(src, pred); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED11_SHIFT do { \ @@ -351,7 +347,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; __m128i pa; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -384,12 +380,12 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, #undef DO_PRED11_SHIFT // Predictor12: ClampedAddSubtractFull. -#define DO_PRED12(DIFF, LANE, OUT) do { \ - const __m128i all = _mm_add_epi16(L, (DIFF)); \ - const __m128i alls = _mm_packus_epi16(all, all); \ - const __m128i res = _mm_add_epi8(src, alls); \ - out[i + (OUT)] = _mm_cvtsi128_si32(res); \ - L = _mm_unpacklo_epi8(res, zero); \ +#define DO_PRED12(DIFF, LANE, OUT) do { \ + const __m128i all = _mm_add_epi16(L, (DIFF)); \ + const __m128i alls = _mm_packus_epi16(all, all); \ + const __m128i res = _mm_add_epi8(src, alls); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \ + L = _mm_unpacklo_epi8(res, zero); \ } while (0) #define DO_PRED12_SHIFT(DIFF, LANE) do { \ @@ -402,7 +398,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; const __m128i zero = _mm_setzero_si128(); - const __m128i L8 = _mm_cvtsi32_si128(out[-1]); + const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]); __m128i L = _mm_unpacklo_epi8(L8, zero); for (i = 0; i + 4 <= num_pixels; i += 4) { // Load 4 pixels at a time. @@ -468,7 +464,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); #undef MK_CST_16 #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb @@ -532,7 +528,7 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); + const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -561,7 +557,7 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -596,8 +592,8 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); - const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); + const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0); + const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; diff --git a/thirdparty/libwebp/src/dsp/lossless_sse41.c b/thirdparty/libwebp/src/dsp/lossless_sse41.c index b0d6daa7fe..bb7ce7611f 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse41.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse41.c @@ -25,11 +25,12 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. #define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend - const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 | - (CST(green_to_blue_) & 0xffff)); + const __m128i mults_rb = + _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 | + (CST(green_to_blue_) & 0xffff))); const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13); const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, diff --git a/thirdparty/libwebp/src/dsp/quant.h b/thirdparty/libwebp/src/dsp/quant.h index 5e8dba8d19..fc099bf9d6 100644 --- a/thirdparty/libwebp/src/dsp/quant.h +++ b/thirdparty/libwebp/src/dsp/quant.h @@ -21,10 +21,15 @@ #define IsFlat IsFlat_NEON -static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, @@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, levels += 16; } - return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); + return thresh >= (int)horizontal_add_uint32x4(sum); } #else diff --git a/thirdparty/libwebp/src/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c index d7effea16e..3f18e94e93 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c @@ -85,7 +85,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum); const __m128i out = _mm_madd_epi16(cur_pixels, mult); assert(sizeof(*frow) == sizeof(uint32_t)); - WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); + WebPInt32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); frow += 1; if (frow >= frow_end) break; accum -= wrk->x_sub; @@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, __m128i base = zero; accum += wrk->x_add; while (accum > 0) { - const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); + const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 @@ -198,7 +198,7 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, const __m128i* const mult, uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); - const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); + const __m128i mask = _mm_set_epi32(~0, 0, ~0, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); const __m128i B1 = _mm_mul_epu32(*A1, *mult); const __m128i B2 = _mm_mul_epu32(*A2, *mult); diff --git a/thirdparty/libwebp/src/dsp/upsampling_sse2.c b/thirdparty/libwebp/src/dsp/upsampling_sse2.c index 340f1e2ac2..08b6d0b1cf 100644 --- a/thirdparty/libwebp/src/dsp/upsampling_sse2.c +++ b/thirdparty/libwebp/src/dsp/upsampling_sse2.c @@ -121,7 +121,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ int uv_pos, pos; \ /* 16byte-aligned array to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ - uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \ uint8_t* const r_v = r_u + 32; \ \ assert(top_y != NULL); \ diff --git a/thirdparty/libwebp/src/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c index 970bbb7884..01a48f9af2 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse2.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE2) -#include "src/dsp/common_sse2.h" #include <stdlib.h> #include <emmintrin.h> +#include "src/dsp/common_sse2.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. @@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } @@ -130,7 +132,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, const __m128i rg0 = _mm_packus_epi16(*B, *A); const __m128i ba0 = _mm_packus_epi16(*R, *G); #endif - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); @@ -147,9 +149,10 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); - const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8)); + const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8)); const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); - const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5); + const __m128i g1 = + _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5); const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); const __m128i rg = _mm_or_si128(r1, g1); const __m128i gb = _mm_or_si128(g2, b1); diff --git a/thirdparty/libwebp/src/dsp/yuv_sse41.c b/thirdparty/libwebp/src/dsp/yuv_sse41.c index 579d1f7402..f79b802e47 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse41.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse41.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE41) -#include "src/dsp/common_sse41.h" #include <stdlib.h> #include <smmintrin.h> +#include "src/dsp/common_sse41.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. @@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } |