diff options
Diffstat (limited to 'thirdparty/libwebp/src/dsp')
-rw-r--r-- | thirdparty/libwebp/src/dsp/alpha_processing_neon.c | 6 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/cpu.c | 2 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/cpu.h | 254 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/dsp.h | 229 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless.h | 8 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc.c | 12 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc_mips32.c | 22 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/lossless_enc_sse2.c | 4 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv.c | 64 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv_neon.c | 108 | ||||
-rw-r--r-- | thirdparty/libwebp/src/dsp/yuv_sse2.c | 119 |
11 files changed, 283 insertions, 545 deletions
diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_neon.c b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c index 9e0ace9421..6716fb77f0 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_neon.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c @@ -83,7 +83,7 @@ static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first, static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha, int alpha_stride, int width, int height, uint8_t* WEBP_RESTRICT dst, int dst_stride) { - uint32_t alpha_mask = 0xffffffffu; + uint32_t alpha_mask = 0xffu; uint8x8_t mask8 = vdup_n_u8(0xff); uint32_t tmp[2]; int i, j; @@ -107,6 +107,7 @@ static int DispatchAlpha_NEON(const uint8_t* WEBP_RESTRICT alpha, dst += dst_stride; } vst1_u8((uint8_t*)tmp, mask8); + alpha_mask *= 0x01010101; alpha_mask &= tmp[0]; alpha_mask &= tmp[1]; return (alpha_mask != 0xffffffffu); @@ -135,7 +136,7 @@ static void DispatchAlphaToGreen_NEON(const uint8_t* WEBP_RESTRICT alpha, static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride, int width, int height, uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { - uint32_t alpha_mask = 0xffffffffu; + uint32_t alpha_mask = 0xffu; uint8x8_t mask8 = vdup_n_u8(0xff); uint32_t tmp[2]; int i, j; @@ -157,6 +158,7 @@ static int ExtractAlpha_NEON(const uint8_t* WEBP_RESTRICT argb, int argb_stride, alpha += alpha_stride; } vst1_u8((uint8_t*)tmp, mask8); + alpha_mask *= 0x01010101; alpha_mask &= tmp[0]; alpha_mask &= tmp[1]; return (alpha_mask == 0xffffffffu); diff --git a/thirdparty/libwebp/src/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c index 3145e190a4..a4ba7f2cb7 100644 --- a/thirdparty/libwebp/src/dsp/cpu.c +++ b/thirdparty/libwebp/src/dsp/cpu.c @@ -11,7 +11,7 @@ // // Author: Christian Duvivier (cduvivier@google.com) -#include "src/dsp/dsp.h" +#include "src/dsp/cpu.h" #if defined(WEBP_HAVE_NEON_RTCD) #include <stdio.h> diff --git a/thirdparty/libwebp/src/dsp/cpu.h b/thirdparty/libwebp/src/dsp/cpu.h new file mode 100644 index 0000000000..57a40d87d4 --- /dev/null +++ b/thirdparty/libwebp/src/dsp/cpu.h @@ -0,0 +1,254 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// CPU detection functions and macros. +// +// Author: Skal (pascal.massimino@gmail.com) + +#ifndef WEBP_DSP_CPU_H_ +#define WEBP_DSP_CPU_H_ + +#ifdef HAVE_CONFIG_H +#include "src/webp/config.h" +#endif + +#include "src/webp/types.h" + +#if defined(__GNUC__) +#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) +#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_GCC_VERSION 0 +#define LOCAL_GCC_PREREQ(maj, min) 0 +#endif + +#if defined(__clang__) +#define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) +#define LOCAL_CLANG_PREREQ(maj, min) \ + (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) +#else +#define LOCAL_CLANG_VERSION 0 +#define LOCAL_CLANG_PREREQ(maj, min) 0 +#endif + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#if !defined(HAVE_CONFIG_H) +#if defined(_MSC_VER) && _MSC_VER > 1310 && \ + (defined(_M_X64) || defined(_M_IX86)) +#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets +#endif + +#if defined(_MSC_VER) && _MSC_VER >= 1500 && \ + (defined(_M_X64) || defined(_M_IX86)) +#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets +#endif +#endif + +// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp +// files without intrinsics, allowing the corresponding Init() to be called. +// Files containing intrinsics will need to be built targeting the instruction +// set so should succeed on one of the earlier tests. +#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2)) +#define WEBP_USE_SSE2 +#endif + +#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2) +#define WEBP_HAVE_SSE2 +#endif + +#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41)) +#define WEBP_USE_SSE41 +#endif + +#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41) +#define WEBP_HAVE_SSE41 +#endif + +#undef WEBP_MSC_SSE41 +#undef WEBP_MSC_SSE2 + +// The intrinsics currently cause compiler errors with arm-nacl-gcc and the +// inline assembly would need to be modified for use with Native Client. +#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ + (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \ + !defined(__native_client__) +#define WEBP_USE_NEON +#endif + +#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ + defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) +#define WEBP_ANDROID_NEON // Android targets that may have NEON +#define WEBP_USE_NEON +#endif + +// Note: ARM64 is supported in Visual Studio 2017, but requires the direct +// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in +// arm_neon.h. Compile errors were seen with Visual Studio 2019 16.4 with +// vtbl4_u8(); a fix was made in 16.6. +#if defined(_MSC_VER) && ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ + (_MSC_VER >= 1926 && defined(_M_ARM64))) +#define WEBP_USE_NEON +#define WEBP_USE_INTRINSICS +#endif + +#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) +#define WEBP_HAVE_NEON +#endif + +#if defined(__mips__) && !defined(__mips64) && defined(__mips_isa_rev) && \ + (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) +#define WEBP_USE_MIPS32 +#if (__mips_isa_rev >= 2) +#define WEBP_USE_MIPS32_R2 +#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) +#define WEBP_USE_MIPS_DSP_R2 +#endif +#endif +#endif + +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define WEBP_USE_MSA +#endif + +#ifndef WEBP_DSP_OMIT_C_CODE +#define WEBP_DSP_OMIT_C_CODE 1 +#endif + +#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE +#define WEBP_NEON_OMIT_C_CODE 1 +#else +#define WEBP_NEON_OMIT_C_CODE 0 +#endif + +#if !(LOCAL_CLANG_PREREQ(3, 8) || LOCAL_GCC_PREREQ(4, 8) || \ + defined(__aarch64__)) +#define WEBP_NEON_WORK_AROUND_GCC 1 +#else +#define WEBP_NEON_WORK_AROUND_GCC 0 +#endif + +// This macro prevents thread_sanitizer from reporting known concurrent writes. +#define WEBP_TSAN_IGNORE_FUNCTION +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#undef WEBP_TSAN_IGNORE_FUNCTION +#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread)) +#endif +#endif + +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define WEBP_MSAN +#endif +#endif + +#if defined(WEBP_USE_THREAD) && !defined(_WIN32) +#include <pthread.h> // NOLINT + +#define WEBP_DSP_INIT(func) \ + do { \ + static volatile VP8CPUInfo func##_last_cpuinfo_used = \ + (VP8CPUInfo)&func##_last_cpuinfo_used; \ + static pthread_mutex_t func##_lock = PTHREAD_MUTEX_INITIALIZER; \ + if (pthread_mutex_lock(&func##_lock)) break; \ + if (func##_last_cpuinfo_used != VP8GetCPUInfo) func(); \ + func##_last_cpuinfo_used = VP8GetCPUInfo; \ + (void)pthread_mutex_unlock(&func##_lock); \ + } while (0) +#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) +#define WEBP_DSP_INIT(func) \ + do { \ + static volatile VP8CPUInfo func##_last_cpuinfo_used = \ + (VP8CPUInfo)&func##_last_cpuinfo_used; \ + if (func##_last_cpuinfo_used == VP8GetCPUInfo) break; \ + func(); \ + func##_last_cpuinfo_used = VP8GetCPUInfo; \ + } while (0) +#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) + +// Defines an Init + helper function that control multiple initialization of +// function pointers / tables. +/* Usage: + WEBP_DSP_INIT_FUNC(InitFunc) { + ...function body + } +*/ +#define WEBP_DSP_INIT_FUNC(name) \ + static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void); \ + WEBP_TSAN_IGNORE_FUNCTION void name(void) { WEBP_DSP_INIT(name##_body); } \ + static WEBP_TSAN_IGNORE_FUNCTION void name##_body(void) + +#define WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +// This macro prevents the undefined behavior sanitizer from reporting +// failures. This is only meant to silence unaligned loads on platforms that +// are known to support them. +#undef WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNDEF __attribute__((no_sanitize("undefined"))) + +// This macro prevents the undefined behavior sanitizer from reporting +// failures related to unsigned integer overflows. This is only meant to +// silence cases where this well defined behavior is expected. +#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. +// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. +#if !defined(WEBP_OFFSET_PTR) +#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) +#endif + +// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) +#if !defined(WEBP_SWAP_16BIT_CSP) +#define WEBP_SWAP_16BIT_CSP 0 +#endif + +// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) +#if !defined(WORDS_BIGENDIAN) && \ + (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) +#define WORDS_BIGENDIAN +#endif + +typedef enum { + kSSE2, + kSSE3, + kSlowSSSE3, // special feature for slow SSSE3 architectures + kSSE4_1, + kAVX, + kAVX2, + kNEON, + kMIPS32, + kMIPSdspR2, + kMSA +} CPUFeature; + +#ifdef __cplusplus +extern "C" { +#endif + +// returns true if the CPU supports the feature. +typedef int (*VP8CPUInfo)(CPUFeature feature); +WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // WEBP_DSP_CPU_H_ diff --git a/thirdparty/libwebp/src/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h index c4f57e4d5b..d2000b8efc 100644 --- a/thirdparty/libwebp/src/dsp/dsp.h +++ b/thirdparty/libwebp/src/dsp/dsp.h @@ -18,6 +18,7 @@ #include "src/webp/config.h" #endif +#include "src/dsp/cpu.h" #include "src/webp/types.h" #ifdef __cplusplus @@ -43,225 +44,6 @@ extern "C" { #define WEBP_RESTRICT #endif -//------------------------------------------------------------------------------ -// CPU detection - -#if defined(__GNUC__) -# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__) -# define LOCAL_GCC_PREREQ(maj, min) \ - (LOCAL_GCC_VERSION >= (((maj) << 8) | (min))) -#else -# define LOCAL_GCC_VERSION 0 -# define LOCAL_GCC_PREREQ(maj, min) 0 -#endif - -#if defined(__clang__) -# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__) -# define LOCAL_CLANG_PREREQ(maj, min) \ - (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min))) -#else -# define LOCAL_CLANG_VERSION 0 -# define LOCAL_CLANG_PREREQ(maj, min) 0 -#endif - -#ifndef __has_builtin -# define __has_builtin(x) 0 -#endif - -#if !defined(HAVE_CONFIG_H) -#if defined(_MSC_VER) && _MSC_VER > 1310 && \ - (defined(_M_X64) || defined(_M_IX86)) -#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets -#endif - -#if defined(_MSC_VER) && _MSC_VER >= 1500 && \ - (defined(_M_X64) || defined(_M_IX86)) -#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets -#endif -#endif - -// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp -// files without intrinsics, allowing the corresponding Init() to be called. -// Files containing intrinsics will need to be built targeting the instruction -// set so should succeed on one of the earlier tests. -#if (defined(__SSE2__) || defined(WEBP_MSC_SSE2)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE2)) -#define WEBP_USE_SSE2 -#endif - -#if defined(WEBP_USE_SSE2) && !defined(WEBP_HAVE_SSE2) -#define WEBP_HAVE_SSE2 -#endif - -#if (defined(__SSE4_1__) || defined(WEBP_MSC_SSE41)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_SSE41)) -#define WEBP_USE_SSE41 -#endif - -#if defined(WEBP_USE_SSE41) && !defined(WEBP_HAVE_SSE41) -#define WEBP_HAVE_SSE41 -#endif - -#undef WEBP_MSC_SSE41 -#undef WEBP_MSC_SSE2 - -// The intrinsics currently cause compiler errors with arm-nacl-gcc and the -// inline assembly would need to be modified for use with Native Client. -#if ((defined(__ARM_NEON__) || defined(__aarch64__)) && \ - (!defined(HAVE_CONFIG_H) || defined(WEBP_HAVE_NEON))) && \ - !defined(__native_client__) -#define WEBP_USE_NEON -#endif - -#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \ - defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H) -#define WEBP_ANDROID_NEON // Android targets that may have NEON -#define WEBP_USE_NEON -#endif - -// Note: ARM64 is supported in Visual Studio 2017, but requires the direct -// inclusion of arm64_neon.h; Visual Studio 2019 includes this file in -// arm_neon.h. -#if defined(_MSC_VER) && \ - ((_MSC_VER >= 1700 && defined(_M_ARM)) || \ - (_MSC_VER >= 1920 && defined(_M_ARM64))) -#define WEBP_USE_NEON -#define WEBP_USE_INTRINSICS -#endif - -#if defined(WEBP_USE_NEON) && !defined(WEBP_HAVE_NEON) -#define WEBP_HAVE_NEON -#endif - -#if defined(__mips__) && !defined(__mips64) && \ - defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6) -#define WEBP_USE_MIPS32 -#if (__mips_isa_rev >= 2) -#define WEBP_USE_MIPS32_R2 -#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2) -#define WEBP_USE_MIPS_DSP_R2 -#endif -#endif -#endif - -#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) -#define WEBP_USE_MSA -#endif - -#ifndef WEBP_DSP_OMIT_C_CODE -#define WEBP_DSP_OMIT_C_CODE 1 -#endif - -#if defined(WEBP_USE_NEON) && WEBP_DSP_OMIT_C_CODE -#define WEBP_NEON_OMIT_C_CODE 1 -#else -#define WEBP_NEON_OMIT_C_CODE 0 -#endif - -#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__)) -#define WEBP_NEON_WORK_AROUND_GCC 1 -#else -#define WEBP_NEON_WORK_AROUND_GCC 0 -#endif - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#define WEBP_TSAN_IGNORE_FUNCTION -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#undef WEBP_TSAN_IGNORE_FUNCTION -#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread)) -#endif -#endif - -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) -#include <pthread.h> // NOLINT - -#define WEBP_DSP_INIT(func) do { \ - static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ - (VP8CPUInfo)&func ## _last_cpuinfo_used; \ - static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \ - if (pthread_mutex_lock(&func ## _lock)) break; \ - if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \ - func ## _last_cpuinfo_used = VP8GetCPUInfo; \ - (void)pthread_mutex_unlock(&func ## _lock); \ -} while (0) -#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32)) -#define WEBP_DSP_INIT(func) do { \ - static volatile VP8CPUInfo func ## _last_cpuinfo_used = \ - (VP8CPUInfo)&func ## _last_cpuinfo_used; \ - if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \ - func(); \ - func ## _last_cpuinfo_used = VP8GetCPUInfo; \ -} while (0) -#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32) - -// Defines an Init + helper function that control multiple initialization of -// function pointers / tables. -/* Usage: - WEBP_DSP_INIT_FUNC(InitFunc) { - ...function body - } -*/ -#define WEBP_DSP_INIT_FUNC(name) \ - static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \ - WEBP_TSAN_IGNORE_FUNCTION void name(void) { \ - WEBP_DSP_INIT(name ## _body); \ - } \ - static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void) - -#define WEBP_UBSAN_IGNORE_UNDEF -#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW -#if defined(__clang__) && defined(__has_attribute) -#if __has_attribute(no_sanitize) -// This macro prevents the undefined behavior sanitizer from reporting -// failures. This is only meant to silence unaligned loads on platforms that -// are known to support them. -#undef WEBP_UBSAN_IGNORE_UNDEF -#define WEBP_UBSAN_IGNORE_UNDEF \ - __attribute__((no_sanitize("undefined"))) - -// This macro prevents the undefined behavior sanitizer from reporting -// failures related to unsigned integer overflows. This is only meant to -// silence cases where this well defined behavior is expected. -#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW -#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ - __attribute__((no_sanitize("unsigned-integer-overflow"))) -#endif -#endif - -// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'. -// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning. -#if !defined(WEBP_OFFSET_PTR) -#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off))) -#endif - -// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility) -#if !defined(WEBP_SWAP_16BIT_CSP) -#define WEBP_SWAP_16BIT_CSP 0 -#endif - -// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__) -#if !defined(WORDS_BIGENDIAN) && \ - (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \ - (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__))) -#define WORDS_BIGENDIAN -#endif - -typedef enum { - kSSE2, - kSSE3, - kSlowSSSE3, // special feature for slow SSSE3 architectures - kSSE4_1, - kAVX, - kAVX2, - kNEON, - kMIPS32, - kMIPSdspR2, - kMSA -} CPUFeature; -// returns true if the CPU supports the feature. -typedef int (*VP8CPUInfo)(CPUFeature feature); -WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo; //------------------------------------------------------------------------------ // Init stub generator @@ -550,15 +332,6 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v, extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, uint8_t* u, uint8_t* v, int width); -// utilities for accurate RGB->YUV conversion -extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref, - uint16_t* dst, int len); -extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref, - int16_t* dst, int len); -extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, - int len, - const uint16_t* best_y, uint16_t* out); - // Must be called before using the above. void WebPInitConvertARGBToYUV(void); diff --git a/thirdparty/libwebp/src/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h index c26c6bca07..de60d95d0b 100644 --- a/thirdparty/libwebp/src/dsp/lossless.h +++ b/thirdparty/libwebp/src/dsp/lossless.h @@ -182,9 +182,9 @@ extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16]; // ----------------------------------------------------------------------------- // Huffman-cost related functions. -typedef double (*VP8LCostFunc)(const uint32_t* population, int length); -typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, - int length); +typedef float (*VP8LCostFunc)(const uint32_t* population, int length); +typedef float (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y, + int length); typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256], const int Y[256]); @@ -198,7 +198,7 @@ typedef struct { // small struct to hold counters } VP8LStreaks; typedef struct { // small struct to hold bit entropy results - double entropy; // entropy + float entropy; // entropy uint32_t sum; // sum of the population int nonzeros; // number of non-zero elements in the population uint32_t max_val; // maximum value in the population diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index 1580631e38..de6c4ace5f 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -402,7 +402,7 @@ static float FastLog2Slow_C(uint32_t v) { // Compute the combined Shanon's entropy for distribution {X} and {X+Y} static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) { int i; - double retval = 0.; + float retval = 0.f; int sumX = 0, sumXY = 0; for (i = 0; i < 256; ++i) { const int x = X[i]; @@ -418,7 +418,7 @@ static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) { } } retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); - return (float)retval; + return retval; } void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) { @@ -636,17 +636,17 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits, //------------------------------------------------------------------------------ -static double ExtraCost_C(const uint32_t* population, int length) { +static float ExtraCost_C(const uint32_t* population, int length) { int i; - double cost = 0.; + float cost = 0.f; for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2]; return cost; } -static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, +static float ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y, int length) { int i; - double cost = 0.; + float cost = 0.f; for (i = 2; i < length - 2; ++i) { const int xy = X[i + 2] + Y[i + 2]; cost += (i >> 1) * xy; diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c index 0412a093cf..639f786631 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c @@ -103,8 +103,8 @@ static float FastLog2Slow_MIPS32(uint32_t v) { // cost += i * *(pop + 1); // pop += 2; // } -// return (double)cost; -static double ExtraCost_MIPS32(const uint32_t* const population, int length) { +// return (float)cost; +static float ExtraCost_MIPS32(const uint32_t* const population, int length) { int i, temp0, temp1; const uint32_t* pop = &population[4]; const uint32_t* const LoopEnd = &population[length]; @@ -130,7 +130,7 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) { : "memory", "hi", "lo" ); - return (double)((int64_t)temp0 << 32 | temp1); + return (float)((int64_t)temp0 << 32 | temp1); } // C version of this function: @@ -148,9 +148,9 @@ static double ExtraCost_MIPS32(const uint32_t* const population, int length) { // pX += 2; // pY += 2; // } -// return (double)cost; -static double ExtraCostCombined_MIPS32(const uint32_t* const X, - const uint32_t* const Y, int length) { +// return (float)cost; +static float ExtraCostCombined_MIPS32(const uint32_t* const X, + const uint32_t* const Y, int length) { int i, temp0, temp1, temp2, temp3; const uint32_t* pX = &X[4]; const uint32_t* pY = &Y[4]; @@ -183,7 +183,7 @@ static double ExtraCostCombined_MIPS32(const uint32_t* const X, : "memory", "hi", "lo" ); - return (double)((int64_t)temp0 << 32 | temp1); + return (float)((int64_t)temp0 << 32 | temp1); } #define HUFFMAN_COST_PASS \ @@ -347,24 +347,24 @@ static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, uint32_t* pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - const uint32_t end = ((size) / 4) * 4; + const int end = ((size) / 4) * 4; const uint32_t* const LoopEnd = pa + end; int i; ASM_START ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) ASM_END_0 - for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; + for (i = 0; i < size - end; ++i) pout[i] = pa[i] + pb[i]; } static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - const uint32_t end = ((size) / 4) * 4; + const int end = ((size) / 4) * 4; const uint32_t* const LoopEnd = pa + end; int i; ASM_START ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) ASM_END_1 - for (i = end; i < size; ++i) pout[i] += pa[i]; + for (i = 0; i < size - end; ++i) pout[i] += pa[i]; } #undef ASM_END_1 diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index b2f83b871c..948001a3d5 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -239,7 +239,7 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) { static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { int i; - double retval = 0.; + float retval = 0.f; int sumX = 0, sumXY = 0; const __m128i zero = _mm_setzero_si128(); @@ -273,7 +273,7 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) { } } retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY); - return (float)retval; + return retval; } #else diff --git a/thirdparty/libwebp/src/dsp/yuv.c b/thirdparty/libwebp/src/dsp/yuv.c index 48466f8b11..d16c13d3ca 100644 --- a/thirdparty/libwebp/src/dsp/yuv.c +++ b/thirdparty/libwebp/src/dsp/yuv.c @@ -194,50 +194,6 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb, //----------------------------------------------------------------------------- -#if !WEBP_NEON_OMIT_C_CODE -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; -} - -static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { - uint64_t diff = 0; - int i; - for (i = 0; i < len; ++i) { - const int diff_y = ref[i] - src[i]; - const int new_y = (int)dst[i] + diff_y; - dst[i] = clip_y(new_y); - diff += (uint64_t)abs(diff_y); - } - return diff; -} - -static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src, - int16_t* dst, int len) { - int i; - for (i = 0; i < len; ++i) { - const int diff_uv = ref[i] - src[i]; - dst[i] += diff_uv; - } -} - -static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { - int i; - for (i = 0; i < len; ++i, ++A, ++B) { - const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4; - const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4; - out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); - } -} -#endif // !WEBP_NEON_OMIT_C_CODE - -#undef MAX_Y - -//----------------------------------------------------------------------------- - void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width); void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width); void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb, @@ -247,18 +203,9 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width); void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v, int src_width, int do_store); -uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len); -void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src, - int16_t* dst, int len); -void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out); - extern void WebPInitConvertARGBToYUVSSE2(void); extern void WebPInitConvertARGBToYUVSSE41(void); extern void WebPInitConvertARGBToYUVNEON(void); -extern void WebPInitSharpYUVSSE2(void); -extern void WebPInitSharpYUVNEON(void); WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { WebPConvertARGBToY = ConvertARGBToY_C; @@ -269,17 +216,10 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C; -#if !WEBP_NEON_OMIT_C_CODE - WebPSharpYUVUpdateY = SharpYUVUpdateY_C; - WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C; - WebPSharpYUVFilterRow = SharpYUVFilterRow_C; -#endif - if (VP8GetCPUInfo != NULL) { #if defined(WEBP_HAVE_SSE2) if (VP8GetCPUInfo(kSSE2)) { WebPInitConvertARGBToYUVSSE2(); - WebPInitSharpYUVSSE2(); } #endif // WEBP_HAVE_SSE2 #if defined(WEBP_HAVE_SSE41) @@ -293,7 +233,6 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { if (WEBP_NEON_OMIT_C_CODE || (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) { WebPInitConvertARGBToYUVNEON(); - WebPInitSharpYUVNEON(); } #endif // WEBP_HAVE_NEON @@ -302,7 +241,4 @@ WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) { assert(WebPConvertRGB24ToY != NULL); assert(WebPConvertBGR24ToY != NULL); assert(WebPConvertRGBA32ToUV != NULL); - assert(WebPSharpYUVUpdateY != NULL); - assert(WebPSharpYUVUpdateRGB != NULL); - assert(WebPSharpYUVFilterRow != NULL); } diff --git a/thirdparty/libwebp/src/dsp/yuv_neon.c b/thirdparty/libwebp/src/dsp/yuv_neon.c index a34d60248f..ff77b00980 100644 --- a/thirdparty/libwebp/src/dsp/yuv_neon.c +++ b/thirdparty/libwebp/src/dsp/yuv_neon.c @@ -173,116 +173,8 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) { WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON; } -//------------------------------------------------------------------------------ - -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y_NEON(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; -} - -static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { - int i; - const int16x8_t zero = vdupq_n_s16(0); - const int16x8_t max = vdupq_n_s16(MAX_Y); - uint64x2_t sum = vdupq_n_u64(0); - uint64_t diff; - - for (i = 0; i + 8 <= len; i += 8) { - const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); - const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); - const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); - const int16x8_t D = vsubq_s16(A, B); // diff_y - const int16x8_t F = vaddq_s16(C, D); // new_y - const uint16x8_t H = - vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); - const int16x8_t I = vabsq_s16(D); // abs(diff_y) - vst1q_u16(dst + i, H); - sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); - } - diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); - for (; i < len; ++i) { - const int diff_y = ref[i] - src[i]; - const int new_y = (int)(dst[i]) + diff_y; - dst[i] = clip_y_NEON(new_y); - diff += (uint64_t)(abs(diff_y)); - } - return diff; -} - -static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src, - int16_t* dst, int len) { - int i; - for (i = 0; i + 8 <= len; i += 8) { - const int16x8_t A = vld1q_s16(ref + i); - const int16x8_t B = vld1q_s16(src + i); - const int16x8_t C = vld1q_s16(dst + i); - const int16x8_t D = vsubq_s16(A, B); // diff_uv - const int16x8_t E = vaddq_s16(C, D); // new_uv - vst1q_s16(dst + i, E); - } - for (; i < len; ++i) { - const int diff_uv = ref[i] - src[i]; - dst[i] += diff_uv; - } -} - -static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { - int i; - const int16x8_t max = vdupq_n_s16(MAX_Y); - const int16x8_t zero = vdupq_n_s16(0); - for (i = 0; i + 8 <= len; i += 8) { - const int16x8_t a0 = vld1q_s16(A + i + 0); - const int16x8_t a1 = vld1q_s16(A + i + 1); - const int16x8_t b0 = vld1q_s16(B + i + 0); - const int16x8_t b1 = vld1q_s16(B + i + 1); - const int16x8_t a0b1 = vaddq_s16(a0, b1); - const int16x8_t a1b0 = vaddq_s16(a1, b0); - const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 - const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) - const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) - const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); - const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); - const int16x8_t d0 = vaddq_s16(c1, a0); - const int16x8_t d1 = vaddq_s16(c0, a1); - const int16x8_t e0 = vrshrq_n_s16(d0, 1); - const int16x8_t e1 = vrshrq_n_s16(d1, 1); - const int16x8x2_t f = vzipq_s16(e0, e1); - const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); - const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); - const int16x8_t h0 = vaddq_s16(g0, f.val[0]); - const int16x8_t h1 = vaddq_s16(g1, f.val[1]); - const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); - const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); - vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); - vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); - } - for (; i < len; ++i) { - const int a0b1 = A[i + 0] + B[i + 1]; - const int a1b0 = A[i + 1] + B[i + 0]; - const int a0a1b0b1 = a0b1 + a1b0 + 8; - const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; - const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; - out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1); - } -} -#undef MAX_Y - -//------------------------------------------------------------------------------ - -extern void WebPInitSharpYUVNEON(void); - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) { - WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON; - WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON; - WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON; -} - #else // !WEBP_USE_NEON WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON) -WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON) #endif // WEBP_USE_NEON diff --git a/thirdparty/libwebp/src/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c index baa48d5371..970bbb7884 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse2.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c @@ -747,128 +747,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2; } -//------------------------------------------------------------------------------ - -#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic -static uint16_t clip_y(int v) { - return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v; -} - -static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src, - uint16_t* dst, int len) { - uint64_t diff = 0; - uint32_t tmp[4]; - int i; - const __m128i zero = _mm_setzero_si128(); - const __m128i max = _mm_set1_epi16(MAX_Y); - const __m128i one = _mm_set1_epi16(1); - __m128i sum = zero; - - for (i = 0; i + 8 <= len; i += 8) { - const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); - const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); - const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); - const __m128i D = _mm_sub_epi16(A, B); // diff_y - const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0) - const __m128i F = _mm_add_epi16(C, D); // new_y - const __m128i G = _mm_or_si128(E, one); // -1 or 1 - const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero); - const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...)) - _mm_storeu_si128((__m128i*)(dst + i), H); - sum = _mm_add_epi32(sum, I); - } - _mm_storeu_si128((__m128i*)tmp, sum); - diff = tmp[3] + tmp[2] + tmp[1] + tmp[0]; - for (; i < len; ++i) { - const int diff_y = ref[i] - src[i]; - const int new_y = (int)dst[i] + diff_y; - dst[i] = clip_y(new_y); - diff += (uint64_t)abs(diff_y); - } - return diff; -} - -static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src, - int16_t* dst, int len) { - int i = 0; - for (i = 0; i + 8 <= len; i += 8) { - const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i)); - const __m128i B = _mm_loadu_si128((const __m128i*)(src + i)); - const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i)); - const __m128i D = _mm_sub_epi16(A, B); // diff_uv - const __m128i E = _mm_add_epi16(C, D); // new_uv - _mm_storeu_si128((__m128i*)(dst + i), E); - } - for (; i < len; ++i) { - const int diff_uv = ref[i] - src[i]; - dst[i] += diff_uv; - } -} - -static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len, - const uint16_t* best_y, uint16_t* out) { - int i; - const __m128i kCst8 = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(MAX_Y); - const __m128i zero = _mm_setzero_si128(); - for (i = 0; i + 8 <= len; i += 8) { - const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0)); - const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1)); - const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0)); - const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1)); - const __m128i a0b1 = _mm_add_epi16(a0, b1); - const __m128i a1b0 = _mm_add_epi16(a1, b0); - const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1 - const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8); - const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1) - const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0) - const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3); - const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3); - const __m128i d0 = _mm_add_epi16(c1, a0); - const __m128i d1 = _mm_add_epi16(c0, a1); - const __m128i e0 = _mm_srai_epi16(d0, 1); - const __m128i e1 = _mm_srai_epi16(d1, 1); - const __m128i f0 = _mm_unpacklo_epi16(e0, e1); - const __m128i f1 = _mm_unpackhi_epi16(e0, e1); - const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0)); - const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8)); - const __m128i h0 = _mm_add_epi16(g0, f0); - const __m128i h1 = _mm_add_epi16(g1, f1); - const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero); - const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero); - _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0); - _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1); - } - for (; i < len; ++i) { - // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 = - // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4 - // We reuse the common sub-expressions. - const int a0b1 = A[i + 0] + B[i + 1]; - const int a1b0 = A[i + 1] + B[i + 0]; - const int a0a1b0b1 = a0b1 + a1b0 + 8; - const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; - const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; - out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); - out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); - } -} - -#undef MAX_Y - -//------------------------------------------------------------------------------ - -extern void WebPInitSharpYUVSSE2(void); - -WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) { - WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2; - WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2; - WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2; -} - #else // !WEBP_USE_SSE2 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) -WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2) #endif // WEBP_USE_SSE2 |