Diffstat (limited to 'thirdparty')
61 files changed, 668 insertions, 463 deletions
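The most visible API change below is to the Godot-specific ENet DTLS hooks: enet_host_dtls_server_setup() and enet_host_dtls_client_setup() now take a single opaque pointer to a Godot TLSOptions object instead of separate key/certificate/verify arguments. The following is only a rough sketch of how a caller adapts to the new prototypes from godot_ext.h; the include paths and the way the TLSOptions object is obtained are assumptions, since they come from Godot's crypto API rather than from this patch.

    #include "enet/enet.h"        /* assumed include path for ENetHost */
    #include "enet/godot_ext.h"   /* new prototypes shown below */

    /* 'tls_options' must point to a Godot TLSOptions object; the hook re-wraps
     * it in a Ref<TLSOptions> internally (see thirdparty/enet/godot.cpp below). */
    int upgrade_to_dtls_server(ENetHost *host, void *tls_options) {
      /* old form: enet_host_dtls_server_setup(host, key_ptr, cert_ptr); */
      return enet_host_dtls_server_setup(host, tls_options);
    }

    int upgrade_to_dtls_client(ENetHost *host, const char *hostname,
                               void *tls_options) {
      /* old form: enet_host_dtls_client_setup(host, cert_ptr, verify, hostname); */
      return enet_host_dtls_client_setup(host, hostname, tls_options);
    }

Inside godot.cpp the same object now feeds both PacketPeerDTLS::connect_to_peer() and DTLSServer::setup(), which is why the separate verify flag and certificate references disappear from ENetDTLSClient and ENetDTLSServer.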
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 38001b8782..f883a3a6da 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -346,7 +346,7 @@ Files extracted from upstream source:
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.2.4 (0d1f12546bd803099a60c070517a552483f3790e, 2022)
+- Version: 1.3.0 (b557776962a3dcc985d83bd4ed94e1e2e50d0fa2, 2022)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -376,7 +376,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: git (ea4558d1c0f217f1d67ed7fe0b07896ece88ae18, 2022)
+- Version: git (4a287848fd664ae1c3fc8e5e008560534ceeb526, 2022)
 - License: MIT
 
 Files extracted from upstream repository:
diff --git a/thirdparty/enet/enet/godot_ext.h b/thirdparty/enet/enet/godot_ext.h
index 648f3d2f24..06a621b790 100644
--- a/thirdparty/enet/enet/godot_ext.h
+++ b/thirdparty/enet/enet/godot_ext.h
@@ -11,8 +11,8 @@
  */
 ENET_API void enet_address_set_ip(ENetAddress * address, const uint8_t * ip, size_t size);
 
-ENET_API int enet_host_dtls_server_setup (ENetHost *, void *, void *);
-ENET_API int enet_host_dtls_client_setup (ENetHost *, void *, uint8_t, const char *);
+ENET_API int enet_host_dtls_server_setup (ENetHost *, void *);
+ENET_API int enet_host_dtls_client_setup (ENetHost *, const char *, void *);
 ENET_API void enet_host_refuse_new_connections (ENetHost *, int);
 
 #endif // __ENET_GODOT_EXT_H__
diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp
index 47298dcf6a..ea7f4957a2 100644
--- a/thirdparty/enet/godot.cpp
+++ b/thirdparty/enet/godot.cpp
@@ -164,16 +164,14 @@ class ENetDTLSClient : public ENetGodotSocket {
     bool connected = false;
     Ref<PacketPeerUDP> udp;
     Ref<PacketPeerDTLS> dtls;
-    bool verify = false;
+    Ref<TLSOptions> tls_options;
     String for_hostname;
-    Ref<X509Certificate> cert;
     IPAddress local_address;
 
 public:
-    ENetDTLSClient(ENetUDP *p_base, Ref<X509Certificate> p_cert, bool p_verify, String p_for_hostname) {
-        verify = p_verify;
+    ENetDTLSClient(ENetUDP *p_base, String p_for_hostname, Ref<TLSOptions> p_options) {
         for_hostname = p_for_hostname;
-        cert = p_cert;
+        tls_options = p_options;
         udp.instantiate();
         dtls = Ref<PacketPeerDTLS>(PacketPeerDTLS::create());
         if (p_base->bound) {
@@ -205,7 +203,7 @@ public:
     Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) {
         if (!connected) {
             udp->connect_to_host(p_ip, p_port);
-            if (dtls->connect_to_peer(udp, verify, for_hostname, cert)) {
+            if (dtls->connect_to_peer(udp, for_hostname, tls_options)) {
                 return FAILED;
             }
             connected = true;
@@ -265,7 +263,7 @@ class ENetDTLSServer : public ENetGodotSocket {
     IPAddress local_address;
 
 public:
-    ENetDTLSServer(ENetUDP *p_base, Ref<CryptoKey> p_key, Ref<X509Certificate> p_cert) {
+    ENetDTLSServer(ENetUDP *p_base, Ref<TLSOptions> p_options) {
         udp_server.instantiate();
         if (p_base->bound) {
             uint16_t port;
@@ -274,7 +272,7 @@ public:
             bind(local_address, port);
         }
         server = Ref<DTLSServer>(DTLSServer::create());
-        server->setup(p_key, p_cert);
+        server->setup(p_options);
     }
 
     ~ENetDTLSServer() {
@@ -437,22 +435,22 @@ ENetSocket enet_socket_create(ENetSocketType type) {
     return socket;
 }
 
-int enet_host_dtls_server_setup(ENetHost *host, void *p_key, void *p_cert) {
+int enet_host_dtls_server_setup(ENetHost *host, void *p_options) {
     ENetGodotSocket *sock = (ENetGodotSocket *)host->socket;
     if (!sock->can_upgrade()) {
         return -1;
     }
-    host->socket = memnew(ENetDTLSServer((ENetUDP *)sock, Ref<CryptoKey>((CryptoKey *)p_key), Ref<X509Certificate>((X509Certificate *)p_cert)));
+    host->socket = memnew(ENetDTLSServer(static_cast<ENetUDP *>(sock), Ref<TLSOptions>(static_cast<TLSOptions *>(p_options))));
     memdelete(sock);
     return 0;
 }
 
-int enet_host_dtls_client_setup(ENetHost *host, void *p_cert, uint8_t p_verify, const char *p_for_hostname) {
+int enet_host_dtls_client_setup(ENetHost *host, const char *p_for_hostname, void *p_options) {
     ENetGodotSocket *sock = (ENetGodotSocket *)host->socket;
     if (!sock->can_upgrade()) {
        return -1;
    }
-    host->socket = memnew(ENetDTLSClient((ENetUDP *)sock, Ref<X509Certificate>((X509Certificate *)p_cert), p_verify, String::utf8(p_for_hostname)));
+    host->socket = memnew(ENetDTLSClient(static_cast<ENetUDP *>(sock), String::utf8(p_for_hostname), Ref<TLSOptions>(static_cast<TLSOptions *>(p_options))));
     memdelete(sock);
     return 0;
 }
diff --git a/thirdparty/libwebp/AUTHORS b/thirdparty/libwebp/AUTHORS
index 3efcbe25b6..2f0c537d1c 100644
--- a/thirdparty/libwebp/AUTHORS
+++ b/thirdparty/libwebp/AUTHORS
@@ -11,11 +11,13 @@ Contributors:
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
+- H. Vetinari (h dot vetinari at gmx dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
+- Jeremy Maitin-Shepard (jbms at google dot com)
 - Johann Koenig (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv.c b/thirdparty/libwebp/sharpyuv/sharpyuv.c
index 8b3ab7216b..7de34fb0b2 100644
--- a/thirdparty/libwebp/sharpyuv/sharpyuv.c
+++ b/thirdparty/libwebp/sharpyuv/sharpyuv.c
@@ -15,16 +15,22 @@
 #include <assert.h>
 #include <limits.h>
-#include <math.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "src/webp/types.h"
-#include "src/dsp/cpu.h"
+#include "sharpyuv/sharpyuv_cpu.h"
 #include "sharpyuv/sharpyuv_dsp.h"
 #include "sharpyuv/sharpyuv_gamma.h"
 
 //------------------------------------------------------------------------------
+
+int SharpYuvGetVersion(void) {
+  return SHARPYUV_VERSION;
+}
+
+//------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion
 
 static const int kNumIterations = 4;
@@ -414,24 +420,45 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
 }
 #undef SAFE_ALLOC
 
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define LOCK_ACCESS \
+  static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
+  if (pthread_mutex_lock(&sharpyuv_lock)) return
+#define UNLOCK_ACCESS_AND_RETURN \
+  do { \
+    (void)pthread_mutex_unlock(&sharpyuv_lock); \
+    return; \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define LOCK_ACCESS do {} while (0)
+#define UNLOCK_ACCESS_AND_RETURN return
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
 // Hidden exported init function.
-// By default SharpYuvConvert calls it with NULL. If needed, users can declare
-// it as extern and call it with a VP8CPUInfo function.
-extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
+// users can declare it as extern and call it with an alternate VP8CPUInfo
+// function.
+SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func); void SharpYuvInit(VP8CPUInfo cpu_info_func) { static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used = (VP8CPUInfo)&sharpyuv_last_cpuinfo_used; - const int initialized = - (sharpyuv_last_cpuinfo_used != (VP8CPUInfo)&sharpyuv_last_cpuinfo_used); - if (cpu_info_func == NULL && initialized) return; - if (sharpyuv_last_cpuinfo_used == cpu_info_func) return; - - SharpYuvInitDsp(cpu_info_func); - if (!initialized) { - SharpYuvInitGammaTables(); + LOCK_ACCESS; + // Only update SharpYuvGetCPUInfo when called from external code to avoid a + // race on reading the value in SharpYuvConvert(). + if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) { + SharpYuvGetCPUInfo = cpu_info_func; + } + if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) { + UNLOCK_ACCESS_AND_RETURN; } - sharpyuv_last_cpuinfo_used = cpu_info_func; + SharpYuvInitDsp(); + SharpYuvInitGammaTables(); + + sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo; + UNLOCK_ACCESS_AND_RETURN; } int SharpYuvConvert(const void* r_ptr, const void* g_ptr, @@ -467,7 +494,8 @@ int SharpYuvConvert(const void* r_ptr, const void* g_ptr, // Stride should be even for uint16_t buffers. return 0; } - SharpYuvInit(NULL); + // The address of the function pointer is used to avoid a read race. + SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo); // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the // rgb->yuv conversion matrix. diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv.h b/thirdparty/libwebp/sharpyuv/sharpyuv.h index 9386ea2185..181b20a0bc 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv.h @@ -12,15 +12,31 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_H_ #define WEBP_SHARPYUV_SHARPYUV_H_ -#include <inttypes.h> - #ifdef __cplusplus extern "C" { #endif +#ifndef SHARPYUV_EXTERN +#ifdef WEBP_EXTERN +#define SHARPYUV_EXTERN WEBP_EXTERN +#else +// This explicitly marks library functions and allows for changing the +// signature for e.g., Windows DLL builds. +#if defined(__GNUC__) && __GNUC__ >= 4 +#define SHARPYUV_EXTERN extern __attribute__((visibility("default"))) +#else +#if defined(_MSC_VER) && defined(WEBP_DLL) +#define SHARPYUV_EXTERN __declspec(dllexport) +#else +#define SHARPYUV_EXTERN extern +#endif /* _MSC_VER && WEBP_DLL */ +#endif /* __GNUC__ >= 4 */ +#endif /* WEBP_EXTERN */ +#endif /* SHARPYUV_EXTERN */ + // SharpYUV API version following the convention from semver.org #define SHARPYUV_VERSION_MAJOR 0 -#define SHARPYUV_VERSION_MINOR 1 +#define SHARPYUV_VERSION_MINOR 2 #define SHARPYUV_VERSION_PATCH 0 // Version as a uint32_t. The major number is the high 8 bits. // The minor number is the middle 8 bits. The patch number is the low 16 bits. @@ -30,6 +46,10 @@ extern "C" { SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \ SHARPYUV_VERSION_PATCH) +// Returns the library's version number, packed in hexadecimal. See +// SHARPYUV_VERSION. +SHARPYUV_EXTERN int SharpYuvGetVersion(void); + // RGB to YUV conversion matrix, in 16 bit fixed point. // y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3] // u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3] @@ -65,11 +85,13 @@ typedef struct { // adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they // should be multiples of 2. 
// width, height: width and height of the image in pixels -int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr, - int rgb_step, int rgb_stride, int rgb_bit_depth, - void* y_ptr, int y_stride, void* u_ptr, int u_stride, - void* v_ptr, int v_stride, int yuv_bit_depth, int width, - int height, const SharpYuvConversionMatrix* yuv_matrix); +SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr, + const void* b_ptr, int rgb_step, + int rgb_stride, int rgb_bit_depth, + void* y_ptr, int y_stride, void* u_ptr, + int u_stride, void* v_ptr, int v_stride, + int yuv_bit_depth, int width, int height, + const SharpYuvConversionMatrix* yuv_matrix); // TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422 // support (it's rarely used in practice, especially for images). diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c new file mode 100644 index 0000000000..29425a0c49 --- /dev/null +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c @@ -0,0 +1,14 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +#include "sharpyuv/sharpyuv_cpu.h" + +// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The +// function pointer is renamed in sharpyuv_cpu.h. +#include "src/dsp/cpu.c" diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h new file mode 100644 index 0000000000..176ca3eb16 --- /dev/null +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h @@ -0,0 +1,22 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_ +#define WEBP_SHARPYUV_SHARPYUV_CPU_H_ + +#include "sharpyuv/sharpyuv.h" + +// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds. +// SharpYuvInit() replaces the use of the function pointer. 
+#undef WEBP_EXTERN +#define WEBP_EXTERN extern +#define VP8GetCPUInfo SharpYuvGetCPUInfo +#include "src/dsp/cpu.h" + +#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_ diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c index 5334fa64fa..0ad22be945 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c @@ -13,7 +13,7 @@ #include <assert.h> #include <math.h> -#include <string.h> +#include <stddef.h> static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); } diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h index 63c99ef5cd..3214e3ac60 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h @@ -35,8 +35,9 @@ typedef struct { } SharpYuvColorSpace; // Fills in 'matrix' for the given YUVColorSpace. -void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space, - SharpYuvConversionMatrix* matrix); +SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix( + const SharpYuvColorSpace* yuv_color_space, + SharpYuvConversionMatrix* matrix); // Enums for precomputed conversion matrices. typedef enum { @@ -49,7 +50,7 @@ typedef enum { } SharpYuvMatrixType; // Returns a pointer to a matrix for one of the predefined colorspaces. -const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( +SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( SharpYuvMatrixType matrix_type); #ifdef __cplusplus diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c index 956fa7ce55..31c272c408 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c @@ -16,7 +16,7 @@ #include <assert.h> #include <stdlib.h> -#include "src/dsp/cpu.h" +#include "sharpyuv/sharpyuv_cpu.h" //----------------------------------------------------------------------------- @@ -75,23 +75,24 @@ void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, extern void InitSharpYuvSSE2(void); extern void InitSharpYuvNEON(void); -void SharpYuvInitDsp(VP8CPUInfo cpu_info_func) { - (void)cpu_info_func; - +void SharpYuvInitDsp(void) { #if !WEBP_NEON_OMIT_C_CODE SharpYuvUpdateY = SharpYuvUpdateY_C; SharpYuvUpdateRGB = SharpYuvUpdateRGB_C; SharpYuvFilterRow = SharpYuvFilterRow_C; #endif + if (SharpYuvGetCPUInfo != NULL) { #if defined(WEBP_HAVE_SSE2) - if (cpu_info_func == NULL || cpu_info_func(kSSE2)) { - InitSharpYuvSSE2(); - } + if (SharpYuvGetCPUInfo(kSSE2)) { + InitSharpYuvSSE2(); + } #endif // WEBP_HAVE_SSE2 + } #if defined(WEBP_HAVE_NEON) - if (WEBP_NEON_OMIT_C_CODE || cpu_info_func == NULL || cpu_info_func(kNEON)) { + if (WEBP_NEON_OMIT_C_CODE || + (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) { InitSharpYuvNEON(); } #endif // WEBP_HAVE_NEON diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h index e561d8d3d0..805fbadbf6 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h @@ -12,9 +12,8 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_ #define WEBP_SHARPYUV_SHARPYUV_DSP_H_ -#include <stdint.h> - -#include "src/dsp/cpu.h" +#include "sharpyuv/sharpyuv_cpu.h" +#include "src/webp/types.h" extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, uint16_t* dst, int len, int bit_depth); @@ -24,6 +23,6 @@ extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, const 
uint16_t* best_y, uint16_t* out, int bit_depth); -void SharpYuvInitDsp(VP8CPUInfo cpu_info_func); +void SharpYuvInitDsp(void); #endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_ diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c index 05b5436f83..20ab2da6bc 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c @@ -13,7 +13,6 @@ #include <assert.h> #include <math.h> -#include <stdint.h> #include "src/webp/types.h" diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h index 2f1a3ff4a0..d13aff59e1 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h @@ -12,7 +12,7 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_ #define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_ -#include <stdint.h> +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c b/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c index 5cf6aaffb0..5840914865 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c @@ -17,11 +17,6 @@ #include <assert.h> #include <stdlib.h> #include <arm_neon.h> -#endif - -extern void InitSharpYuvNEON(void); - -#if defined(WEBP_USE_NEON) static uint16_t clip_NEON(int v, int max) { return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; @@ -169,6 +164,8 @@ static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, //------------------------------------------------------------------------------ +extern void InitSharpYuvNEON(void); + WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { SharpYuvUpdateY = SharpYuvUpdateY_NEON; SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON; @@ -177,6 +174,8 @@ WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { #else // !WEBP_USE_NEON +extern void InitSharpYuvNEON(void); + void InitSharpYuvNEON(void) {} #endif // WEBP_USE_NEON diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c b/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c index 1943873748..9744d1bb6c 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c @@ -16,11 +16,6 @@ #if defined(WEBP_USE_SSE2) #include <stdlib.h> #include <emmintrin.h> -#endif - -extern void InitSharpYuvSSE2(void); - -#if defined(WEBP_USE_SSE2) static uint16_t clip_SSE2(int v, int max) { return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; @@ -199,6 +194,8 @@ WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) { } #else // !WEBP_USE_SSE2 +extern void InitSharpYuvSSE2(void); + void InitSharpYuvSSE2(void) {} #endif // WEBP_USE_SSE2 diff --git a/thirdparty/libwebp/src/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h index 30c1bd3ef9..83791ecd25 100644 --- a/thirdparty/libwebp/src/dec/vp8i_dec.h +++ b/thirdparty/libwebp/src/dec/vp8i_dec.h @@ -31,8 +31,8 @@ extern "C" { // version numbers #define DEC_MAJ_VERSION 1 -#define DEC_MIN_VERSION 2 -#define DEC_REV_VERSION 4 +#define DEC_MIN_VERSION 3 +#define DEC_REV_VERSION 0 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). 
// Constraints are: We need to store one 16x16 block of luma samples (y), diff --git a/thirdparty/libwebp/src/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c index 1348055128..c0ea0181e5 100644 --- a/thirdparty/libwebp/src/dec/vp8l_dec.c +++ b/thirdparty/libwebp/src/dec/vp8l_dec.c @@ -1336,7 +1336,7 @@ static int ReadTransform(int* const xsize, int const* ysize, ok = ok && ExpandColorMap(num_colors, transform); break; } - case SUBTRACT_GREEN: + case SUBTRACT_GREEN_TRANSFORM: break; default: assert(0); // can't happen diff --git a/thirdparty/libwebp/src/dec/webp_dec.c b/thirdparty/libwebp/src/dec/webp_dec.c index 77a54c55d2..3f4f7bb659 100644 --- a/thirdparty/libwebp/src/dec/webp_dec.c +++ b/thirdparty/libwebp/src/dec/webp_dec.c @@ -179,7 +179,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size. } // For odd-sized chunk-payload, there's one byte padding at the end. - disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1; + disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1u; total_size += disk_chunk_size; // Check that total bytes skipped so far does not exceed riff_size. diff --git a/thirdparty/libwebp/src/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c index 41387ec2d6..324e5eb993 100644 --- a/thirdparty/libwebp/src/demux/demux.c +++ b/thirdparty/libwebp/src/demux/demux.c @@ -24,8 +24,8 @@ #include "src/webp/format_constants.h" #define DMUX_MAJ_VERSION 1 -#define DMUX_MIN_VERSION 2 -#define DMUX_REV_VERSION 4 +#define DMUX_MIN_VERSION 3 +#define DMUX_REV_VERSION 0 typedef struct { size_t start_; // start location of the data diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c index a5f8c9f7c7..f0843d0feb 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -26,8 +26,8 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, uint32_t alpha_and = 0xff; int i, j; const __m128i zero = _mm_setzero_si128(); - const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -106,8 +106,8 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, // value is not 0xff if any of the alpha[] is not equal to 0xff. 
uint32_t alpha_and = 0xff; int i, j; - const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i a_mask = _mm_set1_epi32(0xff); // to preserve alpha + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -178,7 +178,7 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, int w, int h, int stride) { const __m128i zero = _mm_setzero_si128(); - const __m128i kMult = _mm_set1_epi16(0x8081u); + const __m128i kMult = _mm_set1_epi16((short)0x8081); const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); const int kSpan = 4; while (h-- > 0) { @@ -267,7 +267,7 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) { } static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { - const __m128i m_color = _mm_set1_epi32(color); + const __m128i m_color = _mm_set1_epi32((int)color); const __m128i zero = _mm_setzero_si128(); int i = 0; for (; i + 8 <= length; i += 8) { diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c index cdf877ce49..1156ac3417 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c @@ -26,7 +26,7 @@ static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb, // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; int i, j; - const __m128i all_0xff = _mm_set1_epi32(~0u); + const __m128i all_0xff = _mm_set1_epi32(~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte diff --git a/thirdparty/libwebp/src/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c index a4ba7f2cb7..62de73f750 100644 --- a/thirdparty/libwebp/src/dsp/cpu.c +++ b/thirdparty/libwebp/src/dsp/cpu.c @@ -212,7 +212,7 @@ VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; #elif defined(WEBP_HAVE_NEON) // In most cases this function doesn't check for NEON support (it's assumed by // the configuration), but enables turning off NEON at runtime, for testing -// purposes, by setting VP8DecGetCPUInfo = NULL. +// purposes, by setting VP8GetCPUInfo = NULL. static int armCPUInfo(CPUFeature feature) { if (feature != kNEON) return 0; #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD) diff --git a/thirdparty/libwebp/src/dsp/cpu.h b/thirdparty/libwebp/src/dsp/cpu.h index 57a40d87d4..be80727c0d 100644 --- a/thirdparty/libwebp/src/dsp/cpu.h +++ b/thirdparty/libwebp/src/dsp/cpu.h @@ -14,6 +14,8 @@ #ifndef WEBP_DSP_CPU_H_ #define WEBP_DSP_CPU_H_ +#include <stddef.h> + #ifdef HAVE_CONFIG_H #include "src/webp/config.h" #endif diff --git a/thirdparty/libwebp/src/dsp/dec_sse2.c b/thirdparty/libwebp/src/dsp/dec_sse2.c index 873aa59e8a..01e6bcb636 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse2.c +++ b/thirdparty/libwebp/src/dsp/dec_sse2.c @@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. 
- dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); @@ -187,10 +187,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); } else { // Store four bytes/pixels per line. - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } } } @@ -213,10 +213,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const __m128i m3 = _mm_subs_epi16(B, d4); const __m128i zero = _mm_setzero_si128(); // Load the source pixels. - __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); @@ -233,10 +233,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. 
- WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } #undef MUL #endif // USE_TRANSFORM_AC3 @@ -477,11 +477,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( - WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]), - WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride])); + WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]), + WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride])); const __m128i A1 = _mm_set_epi32( - WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]), - WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride])); + WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]), + WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride])); // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 @@ -540,7 +540,7 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x, uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { - WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(*x)); *x = _mm_srli_si128(*x, 4); } } @@ -908,10 +908,10 @@ static void VE4_SSE2(uint8_t* dst) { // vertical const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -925,10 +925,10 @@ static void LD4_SSE2(uint8_t* dst) { // Down-Left const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static void VR4_SSE2(uint8_t* dst) { // Vertical-Right @@ -946,10 +946,10 @@ static void VR4_SSE2(uint8_t* dst) { // Vertical-Right const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, 
_mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -970,11 +970,12 @@ static void VL4_SSE2(uint8_t* dst) { // Vertical-Left const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -990,7 +991,7 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const uint32_t K = dst[-1 + 2 * BPS]; const uint32_t L = dst[-1 + 3 * BPS]; const __m128i LKJI_____ = - _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24)); + _mm_cvtsi32_si128((int)(L | (K << 8) | (J << 16) | (I << 24))); const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD); const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); @@ -998,10 +999,10 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } #undef DST @@ -1015,13 +1016,13 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 4) { - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); for (y = 0; y < 4; ++y, dst += BPS) { const int val = dst[-1] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } else if (size == 8) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); @@ -1062,7 +1063,7 @@ static void VE16_SSE2(uint8_t* 
dst) { static void HE16_SSE2(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { - const __m128i values = _mm_set1_epi8(dst[-1]); + const __m128i values = _mm_set1_epi8((char)dst[-1]); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; } @@ -1070,7 +1071,7 @@ static void HE16_SSE2(uint8_t* dst) { // horizontal static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_storeu_si128((__m128i*)(dst + j * BPS), values); } @@ -1130,7 +1131,7 @@ static void VE8uv_SSE2(uint8_t* dst) { // vertical // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } diff --git a/thirdparty/libwebp/src/dsp/dec_sse41.c b/thirdparty/libwebp/src/dsp/dec_sse41.c index 8f18506d54..08a3630272 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse41.c +++ b/thirdparty/libwebp/src/dsp/dec_sse41.c @@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { - const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4)); + const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4)); const __m128i values = _mm_shuffle_epi8(in, kShuffle3); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; diff --git a/thirdparty/libwebp/src/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c index 601962ba76..3a04111c55 100644 --- a/thirdparty/libwebp/src/dsp/enc_neon.c +++ b/thirdparty/libwebp/src/dsp/enc_neon.c @@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { +#if defined(__aarch64__) + return (int)vaddvq_u32(sum); +#else const uint64x2_t sum2 = vpaddlq_u32(sum); - const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); - return (int)sum3; + const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)), + vreinterpret_u32_u64(vget_high_u64(sum2))); + return (int)vget_lane_u32(sum3, 0); +#endif } static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { diff --git a/thirdparty/libwebp/src/dsp/enc_sse2.c b/thirdparty/libwebp/src/dsp/enc_sse2.c index b2e78ed941..1d1055668f 100644 --- a/thirdparty/libwebp/src/dsp/enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/enc_sse2.c @@ -156,10 +156,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. - ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); + ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); + ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); + ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); + ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); @@ -185,10 +185,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. 
- WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); - WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); - WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); - WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); + WebPInt32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); + WebPInt32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); + WebPInt32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); + WebPInt32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); } } } @@ -481,7 +481,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } @@ -489,7 +489,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_store_si128((__m128i*)(dst + j * BPS), values); } @@ -540,7 +540,7 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 8; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_storel_epi64((__m128i*)dst, values); dst += BPS; } @@ -549,7 +549,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 16; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_store_si128((__m128i*)dst, values); dst += BPS; } @@ -722,10 +722,10 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -760,10 +760,10 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void VR4_SSE2(uint8_t* dst, @@ -782,10 +782,10 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, 
_mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -807,11 +807,12 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst, const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -829,10 +830,10 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { @@ -875,14 +876,14 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); int y; for (y = 0; y < 4; ++y, dst += BPS) { const int val = top[-2 - y] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c index 84a54296fd..fb86e58d4a 100644 --- a/thirdparty/libwebp/src/dsp/lossless.c +++ b/thirdparty/libwebp/src/dsp/lossless.c @@ -49,7 +49,7 @@ static WEBP_INLINE uint32_t Clip255(uint32_t a) { } static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) { - return 
Clip255(a + b - c); + return Clip255((uint32_t)(a + b - c)); } static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, @@ -66,7 +66,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, } static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) { - return Clip255(a + (a - b) / 2); + return Clip255((uint32_t)(a + (a - b) / 2)); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, @@ -293,10 +293,10 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, const uint32_t red = argb >> 16; int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red += ColorTransformDelta(m->green_to_red_, green); + new_red += ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue += ColorTransformDelta(m->green_to_blue_, green); - new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red); + new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue += ColorTransformDelta((int8_t)m->red_to_blue_, (int8_t)new_red); new_blue &= 0xff; dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -395,7 +395,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, assert(row_start < row_end); assert(row_end <= transform->ysize_); switch (transform->type_) { - case SUBTRACT_GREEN: + case SUBTRACT_GREEN_TRANSFORM: VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out); break; case PREDICTOR_TRANSFORM: diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index de6c4ace5f..b1f9f26d72 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -522,11 +522,11 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[], void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { int i; for (i = 0; i < num_pixels; ++i) { - const int argb = argb_data[i]; + const int argb = (int)argb_data[i]; const int green = (argb >> 8) & 0xff; const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff; const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff; - argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b; + argb_data[i] = ((uint32_t)argb & 0xff00ff00u) | (new_r << 16) | new_b; } } @@ -547,10 +547,10 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, const int8_t red = U32ToS8(argb >> 16); int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red -= ColorTransformDelta(m->green_to_red_, green); + new_red -= ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue -= ColorTransformDelta(m->green_to_blue_, green); - new_blue -= ColorTransformDelta(m->red_to_blue_, red); + new_blue -= ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue -= ColorTransformDelta((int8_t)m->red_to_blue_, red); new_blue &= 0xff; data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -560,7 +560,7 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); int new_red = argb >> 16; - new_red -= ColorTransformDelta(green_to_red, green); + new_red -= ColorTransformDelta((int8_t)green_to_red, green); return (new_red & 0xff); } @@ -569,9 +569,9 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); const int8_t red = U32ToS8(argb >> 16); - uint8_t new_blue = argb & 0xff; - new_blue -= ColorTransformDelta(green_to_blue, green); - new_blue 
-= ColorTransformDelta(red_to_blue, red); + int new_blue = argb & 0xff; + new_blue -= ColorTransformDelta((int8_t)green_to_blue, green); + new_blue -= ColorTransformDelta((int8_t)red_to_blue, red); return (new_blue & 0xff); } diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index 948001a3d5..66cbaab772 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -54,8 +54,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m, const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks - const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks + const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb @@ -376,7 +376,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, break; } case 2: { - const __m128i mask_or = _mm_set1_epi32(0xff000000); + const __m128i mask_or = _mm_set1_epi32((int)0xff000000); const __m128i mul_cst = _mm_set1_epi16(0x0104); const __m128i mask_mul = _mm_set1_epi16(0x0f00); for (x = 0; x + 16 <= width; x += 16, dst += 4) { @@ -427,7 +427,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_sub_epi8(src, black); diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c index 396cb0bdfc..4b6a532c23 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c @@ -27,23 +27,22 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i V1 = _mm_add_epi16(C0, C1); const __m128i V2 = _mm_sub_epi16(V1, C2); const __m128i b = _mm_packus_epi16(V2, V2); - const uint32_t output = _mm_cvtsi128_si32(b); - return output; + return (uint32_t)_mm_cvtsi128_si32(b); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i B0 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i avg = _mm_add_epi16(C1, C0); const __m128i A0 = _mm_srli_epi16(avg, 1); const __m128i A1 = _mm_sub_epi16(A0, B0); @@ -52,16 +51,15 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, const __m128i A3 = _mm_srai_epi16(A2, 1); const __m128i A4 = _mm_add_epi16(A0, A3); const __m128i A5 = _mm_packus_epi16(A4, A4); - const uint32_t output = _mm_cvtsi128_si32(A5); - return output; + return (uint32_t)_mm_cvtsi128_si32(A5); } static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) { int pa_minus_pb; const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_cvtsi32_si128(a); - const __m128i B0 = _mm_cvtsi32_si128(b); - const __m128i C0 = _mm_cvtsi32_si128(c); + const __m128i A0 = _mm_cvtsi32_si128((int)a); + const __m128i B0 = _mm_cvtsi32_si128((int)b); + const __m128i C0 = _mm_cvtsi32_si128((int)c); const __m128i AC0 = _mm_subs_epu8(A0, C0); const __m128i CA0 = _mm_subs_epu8(C0, A0); const __m128i BC0 = _mm_subs_epu8(B0, C0); @@ -94,8 +92,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, __m128i* const avg) { // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) const __m128i ones = _mm_set1_epi8(1); - const __m128i A0 = _mm_cvtsi32_si128(a0); - const __m128i A1 = _mm_cvtsi32_si128(a1); + const __m128i A0 = _mm_cvtsi32_si128((int)a0); + const __m128i A1 = _mm_cvtsi32_si128((int)a1); const __m128i avg1 = _mm_avg_epu8(A0, A1); const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones); *avg = _mm_sub_epi8(avg1, one); @@ -103,8 +101,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(A1, A0); return _mm_srli_epi16(sum, 1); } @@ -112,19 +110,18 @@ static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) { __m128i output; Average2_uint32_SSE2(a0, a1, &output); - return _mm_cvtsi128_si32(output); + return (uint32_t)_mm_cvtsi128_si32(output); } static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1, uint32_t a2) { const __m128i zero = _mm_setzero_si128(); const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(avg1, A1); const __m128i avg2 = _mm_srli_epi16(sum, 1); const __m128i A2 = _mm_packus_epi16(avg2, avg2); - const uint32_t output = _mm_cvtsi128_si32(A2); - return output; + return (uint32_t)_mm_cvtsi128_si32(A2); } static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, @@ -134,8 +131,7 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, const __m128i sum = _mm_add_epi16(avg2, avg1); const __m128i avg3 = _mm_srli_epi16(sum, 1); const __m128i A0 = _mm_packus_epi16(avg3, avg3); - const uint32_t output = _mm_cvtsi128_si32(A0); - return output; + return (uint32_t)_mm_cvtsi128_si32(A0); } static uint32_t Predictor5_SSE2(const uint32_t* const left, @@ -192,7 +188,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* 
const left, static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_add_epi8(src, black); @@ -208,7 +204,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i prev = _mm_set1_epi32(out[-1]); + __m128i prev = _mm_set1_epi32((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { // a | b | c | d const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); @@ -285,12 +281,12 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) #undef GENERATE_PREDICTOR_2 // Predictor10: average of (average of (L,TL), average of (T, TR)). -#define DO_PRED10(OUT) do { \ - __m128i avgLTL, avg; \ - Average2_m128i(&L, &TL, &avgLTL); \ - Average2_m128i(&avgTTR, &avgLTL, &avg); \ - L = _mm_add_epi8(avg, src); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +#define DO_PRED10(OUT) do { \ + __m128i avgLTL, avg; \ + Average2_m128i(&L, &TL, &avgLTL); \ + Average2_m128i(&avgTTR, &avgLTL, &avg); \ + L = _mm_add_epi8(avg, src); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED10_SHIFT do { \ @@ -303,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -336,7 +332,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i B = _mm_andnot_si128(mask, T); \ const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \ L = _mm_add_epi8(src, pred); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED11_SHIFT do { \ @@ -351,7 +347,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; __m128i pa; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -384,12 +380,12 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, #undef DO_PRED11_SHIFT // Predictor12: ClampedAddSubtractFull. 
-#define DO_PRED12(DIFF, LANE, OUT) do { \ - const __m128i all = _mm_add_epi16(L, (DIFF)); \ - const __m128i alls = _mm_packus_epi16(all, all); \ - const __m128i res = _mm_add_epi8(src, alls); \ - out[i + (OUT)] = _mm_cvtsi128_si32(res); \ - L = _mm_unpacklo_epi8(res, zero); \ +#define DO_PRED12(DIFF, LANE, OUT) do { \ + const __m128i all = _mm_add_epi16(L, (DIFF)); \ + const __m128i alls = _mm_packus_epi16(all, all); \ + const __m128i res = _mm_add_epi8(src, alls); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \ + L = _mm_unpacklo_epi8(res, zero); \ } while (0) #define DO_PRED12_SHIFT(DIFF, LANE) do { \ @@ -402,7 +398,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; const __m128i zero = _mm_setzero_si128(); - const __m128i L8 = _mm_cvtsi32_si128(out[-1]); + const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]); __m128i L = _mm_unpacklo_epi8(L8, zero); for (i = 0; i + 4 <= num_pixels; i += 4) { // Load 4 pixels at a time. @@ -468,7 +464,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); #undef MK_CST_16 #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb @@ -532,7 +528,7 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); + const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -561,7 +557,7 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -596,8 +592,8 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); - const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); + const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0); + const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; diff --git a/thirdparty/libwebp/src/dsp/lossless_sse41.c b/thirdparty/libwebp/src/dsp/lossless_sse41.c index b0d6daa7fe..bb7ce7611f 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse41.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse41.c @@ -25,11 +25,12 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. 
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend - const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 | - (CST(green_to_blue_) & 0xffff)); + const __m128i mults_rb = + _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 | + (CST(green_to_blue_) & 0xffff))); const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13); const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, diff --git a/thirdparty/libwebp/src/dsp/quant.h b/thirdparty/libwebp/src/dsp/quant.h index 5e8dba8d19..fc099bf9d6 100644 --- a/thirdparty/libwebp/src/dsp/quant.h +++ b/thirdparty/libwebp/src/dsp/quant.h @@ -21,10 +21,15 @@ #define IsFlat IsFlat_NEON -static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, @@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, levels += 16; } - return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); + return thresh >= (int)horizontal_add_uint32x4(sum); } #else diff --git a/thirdparty/libwebp/src/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c index d7effea16e..3f18e94e93 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c @@ -85,7 +85,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum); const __m128i out = _mm_madd_epi16(cur_pixels, mult); assert(sizeof(*frow) == sizeof(uint32_t)); - WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); + WebPInt32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); frow += 1; if (frow >= frow_end) break; accum -= wrk->x_sub; @@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, __m128i base = zero; accum += wrk->x_add; while (accum > 0) { - const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); + const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 @@ -198,7 +198,7 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, const __m128i* const mult, uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); - const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); + const __m128i mask = _mm_set_epi32(~0, 0, ~0, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); const __m128i B1 = _mm_mul_epu32(*A1, *mult); const __m128i B2 = _mm_mul_epu32(*A2, *mult); diff --git a/thirdparty/libwebp/src/dsp/upsampling_sse2.c b/thirdparty/libwebp/src/dsp/upsampling_sse2.c index 340f1e2ac2..08b6d0b1cf 100644 --- a/thirdparty/libwebp/src/dsp/upsampling_sse2.c +++ b/thirdparty/libwebp/src/dsp/upsampling_sse2.c @@ -121,7 +121,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ int uv_pos, pos; \ /* 16byte-aligned array 
to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ - uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \ uint8_t* const r_v = r_u + 32; \ \ assert(top_y != NULL); \ diff --git a/thirdparty/libwebp/src/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c index 970bbb7884..01a48f9af2 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse2.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE2) -#include "src/dsp/common_sse2.h" #include <stdlib.h> #include <emmintrin.h> +#include "src/dsp/common_sse2.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. @@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } @@ -130,7 +132,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, const __m128i rg0 = _mm_packus_epi16(*B, *A); const __m128i ba0 = _mm_packus_epi16(*R, *G); #endif - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); @@ -147,9 +149,10 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); - const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8)); + const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8)); const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); - const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5); + const __m128i g1 = + _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5); const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); const __m128i rg = _mm_or_si128(r1, g1); const __m128i gb = _mm_or_si128(g2, b1); diff --git a/thirdparty/libwebp/src/dsp/yuv_sse41.c b/thirdparty/libwebp/src/dsp/yuv_sse41.c index 579d1f7402..f79b802e47 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse41.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse41.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE41) -#include "src/dsp/common_sse41.h" #include <stdlib.h> #include <smmintrin.h> +#include "src/dsp/common_sse41.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. 
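
Several of the hunks above replace literals such as _mm_set1_epi8(0xf0) with _mm_set1_epi8((char)0xf0) (and likewise for 0xe0/0xf8). The intrinsic takes a char, and on targets where char is signed the value 0xf0 (240) does not fit, so the implicit narrowing is implementation-defined and can trip conversion/narrowing warnings; the explicit cast keeps the intended 0xF0 bit pattern while making the narrowing deliberate. A tiny illustrative sketch of the pattern, not taken from the patch itself:

    #include <emmintrin.h>

    /* Mask that keeps the high nibble of every byte in the vector. */
    static WEBP_INLINE __m128i high_nibble_mask(void) {
      /* (char)0xf0 is -16 where char is signed; the stored byte is still 0xF0. */
      return _mm_set1_epi8((char)0xf0);
    }
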
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } diff --git a/thirdparty/libwebp/src/enc/analysis_enc.c b/thirdparty/libwebp/src/enc/analysis_enc.c index ebb784261c..a0001ac034 100644 --- a/thirdparty/libwebp/src/enc/analysis_enc.c +++ b/thirdparty/libwebp/src/enc/analysis_enc.c @@ -391,12 +391,14 @@ static int DoSegmentsJob(void* arg1, void* arg2) { return ok; } +#ifdef WEBP_USE_THREAD static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) { int i; for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i]; dst->alpha += src->alpha; dst->uv_alpha += src->uv_alpha; } +#endif // initialize the job struct with some tasks to perform static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job, @@ -425,10 +427,10 @@ int VP8EncAnalyze(VP8Encoder* const enc) { (enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled. if (do_segments) { const int last_row = enc->mb_h_; - // We give a little more than a half work to the main thread. - const int split_row = (9 * last_row + 15) >> 4; const int total_mb = last_row * enc->mb_w_; #ifdef WEBP_USE_THREAD + // We give a little more than a half work to the main thread. + const int split_row = (9 * last_row + 15) >> 4; const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow); #else @@ -438,6 +440,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { WebPGetWorkerInterface(); SegmentJob main_job; if (do_mt) { +#ifdef WEBP_USE_THREAD SegmentJob side_job; // Note the use of '&' instead of '&&' because we must call the functions // no matter what. @@ -455,6 +458,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { } worker_interface->End(&side_job.worker); if (ok) MergeJobs(&side_job, &main_job); // merge results together +#endif // WEBP_USE_THREAD } else { // Even for single-thread case, we use the generic Worker tools. 
InitSegmentJob(enc, &main_job, 0, last_row); diff --git a/thirdparty/libwebp/src/enc/picture_csp_enc.c b/thirdparty/libwebp/src/enc/picture_csp_enc.c index fabebcf202..78c8ca479b 100644 --- a/thirdparty/libwebp/src/enc/picture_csp_enc.c +++ b/thirdparty/libwebp/src/enc/picture_csp_enc.c @@ -69,10 +69,12 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height, int WebPPictureHasTransparency(const WebPPicture* picture) { if (picture == NULL) return 0; if (picture->use_argb) { - const int alpha_offset = ALPHA_OFFSET; - return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset, - picture->width, picture->height, - 4, picture->argb_stride * sizeof(*picture->argb)); + if (picture->argb != NULL) { + return CheckNonOpaque((const uint8_t*)picture->argb + ALPHA_OFFSET, + picture->width, picture->height, + 4, picture->argb_stride * sizeof(*picture->argb)); + } + return 0; } return CheckNonOpaque(picture->a, picture->width, picture->height, 1, picture->a_stride); @@ -170,21 +172,6 @@ static const int kMinDimensionIterativeConversion = 4; //------------------------------------------------------------------------------ // Main function -extern void SharpYuvInit(VP8CPUInfo cpu_info_func); - -static void SafeInitSharpYuv(void) { -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) - static pthread_mutex_t initsharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; - if (pthread_mutex_lock(&initsharpyuv_lock)) return; -#endif - - SharpYuvInit(VP8GetCPUInfo); - -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) - (void)pthread_mutex_unlock(&initsharpyuv_lock); -#endif -} - static int PreprocessARGB(const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, @@ -481,6 +468,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb, } } +extern void SharpYuvInit(VP8CPUInfo cpu_info_func); + static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, @@ -516,7 +505,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, } if (use_iterative_conversion) { - SafeInitSharpYuv(); + SharpYuvInit(VP8GetCPUInfo); if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) { return 0; } diff --git a/thirdparty/libwebp/src/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h index 71f76702ae..c9927c47d8 100644 --- a/thirdparty/libwebp/src/enc/vp8i_enc.h +++ b/thirdparty/libwebp/src/enc/vp8i_enc.h @@ -31,8 +31,8 @@ extern "C" { // version numbers #define ENC_MAJ_VERSION 1 -#define ENC_MIN_VERSION 2 -#define ENC_REV_VERSION 4 +#define ENC_MIN_VERSION 3 +#define ENC_REV_VERSION 0 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost diff --git a/thirdparty/libwebp/src/enc/vp8l_enc.c b/thirdparty/libwebp/src/enc/vp8l_enc.c index 2b345df610..0b07e529a9 100644 --- a/thirdparty/libwebp/src/enc/vp8l_enc.c +++ b/thirdparty/libwebp/src/enc/vp8l_enc.c @@ -361,10 +361,11 @@ typedef enum { kHistoTotal // Must be last. } HistoIx; -static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) { - const int green = p >> 8; // The upper bits are masked away later. - ++r[((p >> 16) - green) & 0xff]; - ++b[((p >> 0) - green) & 0xff]; +static void AddSingleSubGreen(uint32_t p, + uint32_t* const r, uint32_t* const b) { + const int green = (int)p >> 8; // The upper bits are masked away later. 
+ ++r[(((int)p >> 16) - green) & 0xff]; + ++b[(((int)p >> 0) - green) & 0xff]; } static void AddSingle(uint32_t p, @@ -1354,7 +1355,7 @@ static int EncodeImageInternal( static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height, VP8LBitWriter* const bw) { VP8LPutBits(bw, TRANSFORM_PRESENT, 1); - VP8LPutBits(bw, SUBTRACT_GREEN, 2); + VP8LPutBits(bw, SUBTRACT_GREEN_TRANSFORM, 2); VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height); } diff --git a/thirdparty/libwebp/src/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h index 0f4af1784d..7929138c44 100644 --- a/thirdparty/libwebp/src/mux/muxi.h +++ b/thirdparty/libwebp/src/mux/muxi.h @@ -28,8 +28,8 @@ extern "C" { // Defines and constants. #define MUX_MAJ_VERSION 1 -#define MUX_MIN_VERSION 2 -#define MUX_REV_VERSION 4 +#define MUX_MIN_VERSION 3 +#define MUX_REV_VERSION 0 // Chunk object. typedef struct WebPChunk WebPChunk; diff --git a/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h index 404b9a6d8c..24f3af7b54 100644 --- a/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h +++ b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h @@ -148,9 +148,9 @@ int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v, const range_t value = (range_t)(br->value_ >> pos); const int32_t mask = (int32_t)(split - value) >> 31; // -1 or 0 br->bits_ -= 1; - br->range_ += mask; + br->range_ += (range_t)mask; br->range_ |= 1; - br->value_ -= (bit_t)((split + 1) & mask) << pos; + br->value_ -= (bit_t)((split + 1) & (uint32_t)mask) << pos; BT_TRACK(br); return (v ^ mask) - mask; } diff --git a/thirdparty/libwebp/src/utils/huffman_utils.c b/thirdparty/libwebp/src/utils/huffman_utils.c index 0cba0fbb7d..90c2fbf7c1 100644 --- a/thirdparty/libwebp/src/utils/huffman_utils.c +++ b/thirdparty/libwebp/src/utils/huffman_utils.c @@ -142,7 +142,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, { int step; // step size to replicate values in current table - uint32_t low = -1; // low bits for current root entry + uint32_t low = 0xffffffffu; // low bits for current root entry uint32_t mask = total_size - 1; // mask for low bits uint32_t key = 0; // reversed prefix code int num_nodes = 1; // number of Huffman tree nodes diff --git a/thirdparty/libwebp/src/utils/utils.h b/thirdparty/libwebp/src/utils/utils.h index ef04f108fe..c5ee873357 100644 --- a/thirdparty/libwebp/src/utils/utils.h +++ b/thirdparty/libwebp/src/utils/utils.h @@ -64,7 +64,8 @@ WEBP_EXTERN void WebPSafeFree(void* const ptr); // Alignment #define WEBP_ALIGN_CST 31 -#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST) +#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & \ + ~(uintptr_t)WEBP_ALIGN_CST) #include <string.h> // memcpy() is the safe way of moving potentially unaligned 32b memory. @@ -73,10 +74,19 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) { memcpy(&A, ptr, sizeof(A)); return A; } + +static WEBP_INLINE int32_t WebPMemToInt32(const uint8_t* const ptr) { + return (int32_t)WebPMemToUint32(ptr); +} + static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) { memcpy(ptr, &val, sizeof(val)); } +static WEBP_INLINE void WebPInt32ToMem(uint8_t* const ptr, int val) { + WebPUint32ToMem(ptr, (uint32_t)val); +} + //------------------------------------------------------------------------------ // Reading/writing data. 
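
The WebPMemToInt32()/WebPInt32ToMem() wrappers added to utils.h above let the SSE paths exchange 32-bit values with memory without unaligned dereferences and without implicit unsigned/signed conversions, since _mm_cvtsi32_si128()/_mm_cvtsi128_si32() traffic in plain int. A minimal sketch of that pattern, using hypothetical stand-in names rather than the library's own helpers:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Unaligned-safe 32-bit load via memcpy, returned as a signed value. */
    static inline int32_t mem_to_int32(const uint8_t* ptr) {
      uint32_t v;
      memcpy(&v, ptr, sizeof(v));   /* avoids alignment and aliasing issues */
      return (int32_t)v;
    }

    /* Feed four bytes into an SSE register without a sign/width warning. */
    static inline __m128i load_4_bytes(const uint8_t* src) {
      return _mm_cvtsi32_si128(mem_to_int32(src));
    }
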
diff --git a/thirdparty/libwebp/src/webp/format_constants.h b/thirdparty/libwebp/src/webp/format_constants.h index eca6981a47..999035c5d2 100644 --- a/thirdparty/libwebp/src/webp/format_constants.h +++ b/thirdparty/libwebp/src/webp/format_constants.h @@ -55,7 +55,7 @@ typedef enum { PREDICTOR_TRANSFORM = 0, CROSS_COLOR_TRANSFORM = 1, - SUBTRACT_GREEN = 2, + SUBTRACT_GREEN_TRANSFORM = 2, COLOR_INDEXING_TRANSFORM = 3 } VP8LImageTransformType; diff --git a/thirdparty/libwebp/src/webp/types.h b/thirdparty/libwebp/src/webp/types.h index 47f7f2b007..f255432e41 100644 --- a/thirdparty/libwebp/src/webp/types.h +++ b/thirdparty/libwebp/src/webp/types.h @@ -42,7 +42,11 @@ typedef long long int int64_t; # if defined(__GNUC__) && __GNUC__ >= 4 # define WEBP_EXTERN extern __attribute__ ((visibility ("default"))) # else -# define WEBP_EXTERN extern +# if defined(_MSC_VER) && defined(WEBP_DLL) +# define WEBP_EXTERN __declspec(dllexport) +# else +# define WEBP_EXTERN extern +# endif # endif /* __GNUC__ >= 4 */ #endif /* WEBP_EXTERN */ diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md index 3c52415f62..b673c248b2 100644 --- a/thirdparty/meshoptimizer/LICENSE.md +++ b/thirdparty/meshoptimizer/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016-2021 Arseny Kapoulkine +Copyright (c) 2016-2022 Arseny Kapoulkine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp index b1f7b359c1..c4672ad606 100644 --- a/thirdparty/meshoptimizer/clusterizer.cpp +++ b/thirdparty/meshoptimizer/clusterizer.cpp @@ -283,6 +283,79 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int return result; } +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +{ + unsigned int best_triangle = ~0u; + unsigned int best_extra = 5; + float best_score = FLT_MAX; + + for (size_t i = 0; i < meshlet.vertex_count; ++i) + { + unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; + + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; + + for (size_t j = 0; j < neighbors_size; ++j) + { + unsigned int triangle = neighbors[j]; + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + + unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + + // triangles that don't add new vertices to meshlets are max. 
priority + if (extra != 0) + { + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + extra = 0; + + extra++; + } + + // since topology-based priority is always more important than the score, we can skip scoring in some cases + if (extra > best_extra) + continue; + + float score = 0; + + // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles) + if (meshlet_cone) + { + const Cone& tri_cone = triangles[triangle]; + + float distance2 = + (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) + + (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) + + (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz); + + float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz; + + score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); + } + else + { + // each live_triangles entry is >= 1 since it includes the current triangle we're processing + score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3); + } + + // note that topology-based priority is always more important than the score + // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost + if (extra < best_extra || score < best_score) + { + best_triangle = triangle; + best_extra = extra; + best_score = score; + } + } + } + + if (out_extra) + *out_extra = best_extra; + + return best_triangle; +} + struct KDNode { union @@ -464,13 +537,15 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + assert(cone_weight >= 0 && cone_weight <= 1); + meshopt_Allocator allocator; TriangleAdjacency2 adjacency = {}; @@ -511,65 +586,18 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve for (;;) { - unsigned int best_triangle = ~0u; - unsigned int best_extra = 5; - float best_score = FLT_MAX; - Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - for (size_t i = 0; i < meshlet.vertex_count; ++i) - { - unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; - - unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbours_size = adjacency.counts[index]; - - for (size_t j = 0; j < neighbours_size; ++j) - { - unsigned int triangle = neighbours[j]; - assert(!emitted_flags[triangle]); - - unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); - - // triangles that don't add new vertices to meshlets are max. 
priority - if (extra != 0) - { - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - extra = 0; - - extra++; - } - - // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (extra > best_extra) - continue; - - const Cone& tri_cone = triangles[triangle]; - - float distance2 = - (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) + - (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) + - (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz); - - float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; + unsigned int best_extra = 0; + unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); - float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); - - // note that topology-based priority is always more important than the score - // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (extra < best_extra || score < best_score) - { - best_triangle = triangle; - best_extra = extra; - best_score = score; - } - } + // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring + if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) + { + best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL); } + // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity if (best_triangle == ~0u) { float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; @@ -604,16 +632,16 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve { unsigned int index = indices[best_triangle * 3 + k]; - unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbours_size = adjacency.counts[index]; + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; - for (size_t i = 0; i < neighbours_size; ++i) + for (size_t i = 0; i < neighbors_size; ++i) { - unsigned int tri = neighbours[i]; + unsigned int tri = neighbors[i]; if (tri == best_triangle) { - neighbours[i] = neighbours[neighbours_size - 1]; + neighbors[i] = neighbors[neighbors_size - 1]; adjacency.counts[index]--; break; } @@ -687,7 +715,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t assert(index_count % 3 == 0); assert(index_count / 3 <= kMeshletMaxTriangles); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); (void)vertex_count; @@ -839,7 +867,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices using namespace meshopt; assert(triangle_count <= kMeshletMaxTriangles); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && 
vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); unsigned int indices[kMeshletMaxTriangles * 3]; diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp index f60db0dc4f..cad808a2b1 100644 --- a/thirdparty/meshoptimizer/indexgenerator.cpp +++ b/thirdparty/meshoptimizer/indexgenerator.cpp @@ -187,7 +187,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int using namespace meshopt; assert(indices || index_count == vertex_count); - assert(index_count % 3 == 0); + assert(!indices || index_count % 3 == 0); assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; @@ -412,7 +412,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; @@ -483,7 +483,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h index 463fad29da..46d28d3ea3 100644 --- a/thirdparty/meshoptimizer/meshoptimizer.h +++ b/thirdparty/meshoptimizer/meshoptimizer.h @@ -1,7 +1,7 @@ /** - * meshoptimizer - version 0.17 + * meshoptimizer - version 0.18 * - * Copyright (C) 2016-2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Copyright (C) 2016-2022, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer * * This library is distributed under the MIT License. See notice at the end of this file. @@ -12,7 +12,7 @@ #include <stddef.h> /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 170 /* 0.17 */ +#define MESHOPTIMIZER_VERSION 180 /* 0.18 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -37,8 +37,8 @@ extern "C" { #endif /** - * Vertex attribute stream, similar to glVertexPointer - * Each element takes size bytes, with stride controlling the spacing between successive elements. + * Vertex attribute stream + * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size). */ struct meshopt_Stream { @@ -115,7 +115,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering. 
* * destination must contain enough space for the resulting index buffer (index_count*2 elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -131,7 +131,7 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details. * * destination must contain enough space for the resulting index buffer (index_count*4 elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -171,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination * * destination must contain enough space for the resulting index buffer (index_count elements) * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently */ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold); @@ -313,7 +313,21 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data); /** - * Experimental: Mesh simplifier + * Simplification options + */ +enum +{ + /* Do not move vertices that are located on the topological border (vertices on triangle edges that don't have a paired triangle). Useful for simplifying portions of the larger mesh. */ + meshopt_SimplifyLockBorder = 1 << 0, +}; + +/** + * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); + +/** + * Mesh simplifier * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. 
* If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification. @@ -322,16 +336,12 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); - -/** - * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) - */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); +MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** * Experimental: Mesh simplifier (sloppy) @@ -342,8 +352,8 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* d * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); @@ -356,17 +366,17 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer (target_vertex_count elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count); /** - * Experimental: Returns the error scaling factor used by the simplifier to convert between absolute and relative extents + * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents * * Absolute error must be *divided* by the scaling factor before passing it to meshopt_simplify as target_error * Relative error returned by meshopt_simplify via result_error must be *multiplied* by the scaling factor to get absolute error. */ -MESHOPTIMIZER_EXPERIMENTAL float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Mesh stripifier @@ -418,7 +428,7 @@ struct meshopt_OverdrawStatistics * Returns overdraw statistics using a software rasterizer * Results may not match actual GPU performance * - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -456,7 +466,7 @@ struct meshopt_Meshlet * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512) * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ @@ -498,7 +508,7 @@ struct meshopt_Bounds * The formula that uses the apex is slightly more accurate but needs the apex; if 
you are already using bounding sphere * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable. * - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size) */ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -518,7 +528,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destinati * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. * * destination must contain enough space for the resulting index buffer (index_count elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -610,7 +620,7 @@ inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_s template <typename T> inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size); template <typename T> -inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0); +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = 0); template <typename T> inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0); template <typename T> @@ -945,12 +955,12 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const } template <typename T> -inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error) +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error) { meshopt_IndexAdapter<T> in(0, indices, index_count); meshopt_IndexAdapter<T> out(destination, 0, index_count); - return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error); + return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, 
options, result_error); } template <typename T> @@ -1039,7 +1049,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_ #endif /** - * Copyright (c) 2016-2021 Arseny Kapoulkine + * Copyright (c) 2016-2022 Arseny Kapoulkine * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp index 8d5859ba39..8b6f254134 100644 --- a/thirdparty/meshoptimizer/overdrawanalyzer.cpp +++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp @@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp index 143656ed76..cc22dbcffc 100644 --- a/thirdparty/meshoptimizer/overdrawoptimizer.cpp +++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp @@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch b/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch index 21daac6eec..5cac985dc5 100644 --- a/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch +++ b/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch @@ -1,5 +1,5 @@ diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp -index 5e92e2dc73..e40c141e76 100644 +index d8d4a67391..3847afc736 100644 --- a/thirdparty/meshoptimizer/simplifier.cpp +++ b/thirdparty/meshoptimizer/simplifier.cpp @@ -20,7 +20,7 @@ @@ -11,7 +11,7 @@ index 5e92e2dc73..e40c141e76 100644 // This work is based on: // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 
1997 -@@ -453,6 +453,7 @@ struct Collapse +@@ -458,6 +458,7 @@ struct Collapse float error; unsigned int errorui; }; @@ -19,7 +19,7 @@ index 5e92e2dc73..e40c141e76 100644 }; static float normalize(Vector3& v) -@@ -533,6 +534,34 @@ static float quadricError(const Quadric& Q, const Vector3& v) +@@ -538,6 +539,34 @@ static float quadricError(const Quadric& Q, const Vector3& v) return fabsf(r) * s; } @@ -54,7 +54,7 @@ index 5e92e2dc73..e40c141e76 100644 static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) { float aw = a * w; -@@ -688,7 +717,7 @@ static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3 +@@ -693,7 +722,7 @@ static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3 } #endif @@ -63,7 +63,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < index_count; i += 3) { -@@ -698,6 +727,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -703,6 +732,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic Quadric Q; quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f); @@ -73,7 +73,7 @@ index 5e92e2dc73..e40c141e76 100644 #if ATTRIBUTES quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w); -@@ -708,7 +740,7 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -713,7 +745,7 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic } } @@ -82,7 +82,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < index_count; i += 3) { -@@ -752,6 +784,9 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -757,6 +789,9 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); @@ -92,7 +92,7 @@ index 5e92e2dc73..e40c141e76 100644 } } } -@@ -856,7 +891,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices +@@ -861,7 +896,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices return collapse_count; } @@ -101,7 +101,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < collapse_count; ++i) { -@@ -876,10 +911,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const +@@ -881,10 +916,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const float ei = quadricError(qi, vertex_positions[i1]); float ej = quadricError(qj, vertex_positions[j1]); @@ -116,7 +116,7 @@ index 5e92e2dc73..e40c141e76 100644 } } -@@ -976,7 +1015,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse +@@ -981,7 +1020,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse } } @@ -125,7 +125,7 @@ index 5e92e2dc73..e40c141e76 100644 { size_t edge_collapses = 0; size_t triangle_collapses = 0; -@@ -1038,6 +1077,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* +@@ -1043,6 +1082,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* assert(collapse_remap[r1] == r1); quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); @@ -133,7 +133,7 @@ index 5e92e2dc73..e40c141e76 100644 if (vertex_kind[i0] == Kind_Complex) { -@@ -1075,7 +1115,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* +@@ -1080,7 +1120,7 @@ 
static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
edge_collapses++;
@@ -142,7 +142,7 @@ index 5e92e2dc73..e40c141e76 100644
}
#if TRACE
-@@ -1463,9 +1503,11 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1469,9 +1509,11 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
@@ -156,7 +156,7 @@ index 5e92e2dc73..e40c141e76 100644
if (result != indices)
memcpy(result, indices, index_count * sizeof(unsigned int));
-@@ -1496,7 +1538,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1502,7 +1544,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
if (edge_collapse_count == 0)
break;
@@ -165,7 +165,7 @@ index 5e92e2dc73..e40c141e76 100644
#if TRACE > 1
dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
-@@ -1515,7 +1557,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1521,7 +1563,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
printf("pass %d: ", int(pass_count++));
#endif
diff --git a/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
index 33a17fe9fa..c065026a7d 100644
--- a/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
+++ b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
@@ -1,21 +1,21 @@
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
-index be4b765d97..463fad29da 100644
+index d95725dd71..46d28d3ea3 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
-@@ -328,6 +328,11 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_
- */
- MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+@@ -321,6 +321,11 @@ enum
+ meshopt_SimplifyLockBorder = 1 << 0,
+ };
+/**
+ * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex)
+ */
-+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
++MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
+ /**
- * Experimental: Mesh simplifier (sloppy)
- * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
-index a74b08a97d..5e92e2dc73 100644
+index 5f0e9bac31..797329b010 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -20,6 +20,8 @@
@@ -27,7 +27,7 @@ index a74b08a97d..5e92e2dc73 100644
// This work is based on:
// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
// Michael Garland. Quadric-based polygonal surface simplification. 1999
-@@ -371,6 +373,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
+@@ -376,6 +378,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
struct Vector3
{
float x, y, z;
@@ -38,7 +38,7 @@ index a74b08a97d..5e92e2dc73 100644
};
static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
-@@ -427,6 +433,13 @@ struct Quadric
+@@ -432,6 +438,13 @@ struct Quadric
float a10, a20, a21;
float b0, b1, b2, c;
float w;
@@ -52,7 +52,7 @@ index a74b08a97d..5e92e2dc73 100644
};
struct Collapse
-@@ -469,6 +482,16 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
Q.b2 += R.b2;
Q.c += R.c;
Q.w += R.w;
@@ -69,7 +69,7 @@ index a74b08a97d..5e92e2dc73 100644
}
static float quadricError(const Quadric& Q, const Vector3& v)
-@@ -494,6 +517,17 @@ static float quadricError(const Quadric& Q, const Vector3& v)
r += ry * v.y;
r += rz * v.z;
@@ -87,7 +87,7 @@ index a74b08a97d..5e92e2dc73 100644
float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
return fabsf(r) * s;
-@@ -517,6 +551,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
+@@ -522,6 +556,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
Q.b2 = c * dw;
Q.c = d * dw;
Q.w = w;
@@ -101,7 +101,7 @@ index a74b08a97d..5e92e2dc73 100644
}
static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
-@@ -569,6 +610,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
+@@ -574,6 +615,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
}
@@ -186,7 +186,7 @@ index a74b08a97d..5e92e2dc73 100644
static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
{
for (size_t i = 0; i < index_count; i += 3)
-@@ -580,6 +699,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
+@@ -585,6 +704,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
Quadric Q;
quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
@@ -196,29 +196,30 @@ index a74b08a97d..5e92e2dc73 100644
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
quadricAdd(vertex_quadrics[remap[i2]], Q);
-@@ -1273,13 +1395,19 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
+@@ -1278,14 +1400,20 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
#endif
- size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+ return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, options, out_result_error, 0, 0, 0);
+}
+
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
{
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
- assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(vertex_stride >= 12 && vertex_stride <= 256);
+ assert(vertex_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
+ assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
+ assert(attribute_count <= ATTRIBUTES);
meshopt_Allocator allocator;
-@@ -1293,7 +1421,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+@@ -1299,7 +1427,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
// build position remap that maps each vertex to the one with identical position
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
@@ -227,7 +228,7 @@ index a74b08a97d..5e92e2dc73 100644
// classify vertices; vertex kind determines collapse rules, see kCanCollapse
unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
-@@ -1317,7 +1445,21 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
#endif
Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+
Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
-@@ -1409,7 +1551,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
// result_error is quadratic; we need to remap it back to linear
if (out_result_error)
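The hunks above change the public simplifier API: meshopt_simplify() and the Godot-added meshopt_simplifyWithAttributes() both gain an unsigned int options bitmask, and meshopt_SimplifyLockBorder (1 << 0) is the only flag currently accepted. A minimal caller-side sketch of the updated signature follows; it is illustrative only, and the buffer names, target count and error threshold are made up:

    #include <vector>
    #include "meshoptimizer.h"

    // Build a lower-detail index buffer while keeping border vertices pinned, which
    // helps when neighbouring mesh chunks are simplified independently.
    std::vector<unsigned int> build_lod(const std::vector<unsigned int>& indices,
                                        const std::vector<float>& positions, // xyz per vertex
                                        size_t target_index_count)
    {
        std::vector<unsigned int> lod(indices.size());
        float result_error = 0.f;

        size_t lod_size = meshopt_simplify(
                lod.data(), indices.data(), indices.size(),
                positions.data(), positions.size() / 3, 3 * sizeof(float),
                target_index_count, /* target_error */ 0.01f,
                meshopt_SimplifyLockBorder, &result_error);

        lod.resize(lod_size);
        return lod;
    }

Passing 0 for options keeps the previous behaviour.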
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
index e40c141e76..391a77861a 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -254,7 +254,7 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int
return false;
}
-static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, unsigned int options)
{
memset(loop, -1, vertex_count * sizeof(unsigned int));
memset(loopback, -1, vertex_count * sizeof(unsigned int));
@@ -364,6 +364,11 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
}
}
+ if (options & meshopt_SimplifyLockBorder)
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[i] == Kind_Border)
+ result[i] = Kind_Locked;
+
#if TRACE
printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n", int(stats[0]), int(stats[1]), int(stats[2]), int(stats[3]));
@@ -1434,19 +1439,20 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = 0;
MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
#endif
-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
- return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, out_result_error, 0, 0, 0);
+ return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, options, out_result_error, 0, 0, 0);
}
-size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
{
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_stride > 0 && vertex_stride <= 256);
+ assert(vertex_stride >= 12 && vertex_stride <= 256);
assert(vertex_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
+ assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
assert(attribute_count <= ATTRIBUTES);
meshopt_Allocator allocator;
@@ -1467,7 +1473,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
- classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+ classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge, options);
#if TRACE
size_t unique_positions = 0;
@@ -1605,7 +1611,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
@@ -1736,7 +1742,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_vertex_count <= vertex_count);
@@ -1848,7 +1854,7 @@ float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count,
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);
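Besides threading options through to classifyVertices(), the simplifier hunks above (and the spatialorder.cpp ones below) tighten the stride assertions from "> 0" to ">= 12" bytes, i.e. at least three floats of position per vertex. The usual interleaved-layout call pattern is unaffected; a small sketch, with a hypothetical Vertex struct:

    #include "meshoptimizer.h"

    struct Vertex {
        float px, py, pz; // the 12 bytes of position data the new asserts require
        float nx, ny, nz; // further attributes are skipped via the stride
        float u, v;
    };

    // Scale factor used to convert the simplifier's relative error into mesh units.
    float lod_error_scale(const Vertex* vertices, size_t vertex_count)
    {
        // Point at the first position; the stride covers the whole interleaved vertex.
        return meshopt_simplifyScale(&vertices[0].px, vertex_count, sizeof(Vertex));
    }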
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
index b09f80ac6f..7b1a069450 100644
--- a/thirdparty/meshoptimizer/spatialorder.cpp
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
(void)vertex_count;
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
index fb8ade4b77..ce8fd3a887 100644
--- a/thirdparty/meshoptimizer/vcacheoptimizer.cpp
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -110,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned
return ~0u;
}
-static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
unsigned int best_candidate = ~0u;
int best_priority = -1;
@@ -281,16 +281,16 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
{
unsigned int index = indices[current_triangle * 3 + k];
- unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
- size_t neighbours_size = adjacency.counts[index];
+ unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+ size_t neighbors_size = adjacency.counts[index];
- for (size_t i = 0; i < neighbours_size; ++i)
+ for (size_t i = 0; i < neighbors_size; ++i)
{
- unsigned int tri = neighbours[i];
+ unsigned int tri = neighbors[i];
if (tri == current_triangle)
{
- neighbours[i] = neighbours[neighbours_size - 1];
+ neighbors[i] = neighbors[neighbors_size - 1];
adjacency.counts[index]--;
break;
}
@@ -314,10 +314,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
vertex_scores[index] = score;
// update scores of vertex triangles
- const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
- const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+ const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
+ const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
- for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int tri = *it;
assert(!emitted_flags[tri]);
@@ -412,11 +412,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
{
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
- // emit all vertex neighbours
- const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
- const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+ // emit all vertex neighbors
+ const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+ const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
- for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int triangle = *it;
@@ -461,7 +461,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
// get next vertex
- current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+ current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
if (current_vertex == ~0u)
{
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 7925ea862c..4bd11121d2 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -50,6 +50,12 @@
#define SIMD_TARGET
#endif
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
#endif // !MESHOPTIMIZER_NO_SIMD
#ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
typedef int unaligned_int;
#endif
+#ifdef SIMD_LATENCYOPT
+ unsigned int data32;
+ memcpy(&data32, data, 4);
+ data32 &= data32 >> 1;
+
+ // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+ unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+#ifdef SIMD_LATENCYOPT
+ return data + 4 + datacnt;
+#else
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 2:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned long long data64;
+ memcpy(&data64, data, 8);
+ data64 &= data64 >> 1;
+ data64 &= data64 >> 2;
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+#ifdef SIMD_LATENCYOPT
+ return data + 8 + datacnt;
+#else
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 3:
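The SIMD_LATENCYOPT blocks above compute the size of the current group arithmetically instead of waiting on the two kDecodeBytesGroupCount table lookups, so the address of the next 16-byte group is available sooner. A standalone scalar sketch of the 2-bit case, under the same assumption the comment states (an all-sentinel group would have been encoded as mode 3, so the count is at most 15); the helper name is made up:

    #include <cstring>

    static int count_sentinel_codes(const unsigned char* data)
    {
        unsigned int data32;
        memcpy(&data32, data, 4);

        // bit 2*i of data32 becomes 1 exactly when the i-th two-bit code is 3
        data32 &= data32 >> 1;

        // spread the 16 flags so each one sits in the low bit of its own nibble
        unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

        // horizontal add of all nibbles via multiply; the sum lands in the top nibble
        return int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
    }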
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
- static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
-
- uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
- uint8x16_t masked = vandq_u8(mask, byte_mask);
+ // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+ const uint64_t magic = 0x000103070f1f3f80ull;
-#ifdef __aarch64__
- // aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
- mask0 = vaddv_u8(vget_low_u8(masked));
- mask1 = vaddv_u8(vget_high_u8(masked));
-#else
- // we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
- uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
- uint8x8_t sum2 = vpadd_u8(sum1, sum1);
- uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+ uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
- mask0 = vget_lane_u8(sum3, 0);
- mask1 = vget_lane_u8(sum3, 1);
-#endif
+ mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+ mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
case 1:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned int data32;
+ memcpy(&data32, data, 4);
+ data32 &= data32 >> 1;
+
+ // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+ unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
uint8x8_t sel2 = vld1_u8(data);
uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
vst1q_u8(buffer, result);
+#ifdef SIMD_LATENCYOPT
+ return data + 4 + datacnt;
+#else
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 2:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned long long data64;
+ memcpy(&data64, data, 8);
+ data64 &= data64 >> 1;
+ data64 &= data64 >> 2;
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
uint8x8_t sel4 = vld1_u8(data);
uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
vst1q_u8(buffer, result);
+#ifdef SIMD_LATENCYOPT
+ return data + 8 + datacnt;
+#else
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
- // TODO: This can use v8x16_bitmask in the future
mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}
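neonMoveMask and wasmMoveMask now share the same reduction: for a 64-bit lane whose bytes are each 0x00 or 0xff, a single multiply by the z3-derived constant gathers the eight per-byte flags into the top byte of the product. A scalar sketch of the idea, with a made-up helper name:

    #include <cstdint>

    static uint8_t movemask8(uint64_t lane) // every byte of lane must be 0x00 or 0xff
    {
        const uint64_t magic = 0x000103070f1f3f80ull;
        return uint8_t((lane * magic) >> 56); // bit i is set iff byte i of lane was 0xff
    }

    // For example, movemask8(0x00000000000000ffull) == 0x01 and movemask8(0xff00000000000000ull) == 0x80.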
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
index 606a280aa9..14a73b1ddd 100644
--- a/thirdparty/meshoptimizer/vertexfilter.cpp
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -931,7 +931,7 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
const float* v = &data[i * stride_float];
unsigned int* d = &destination[i * stride_float];
- // use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
+ // use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
int exp = -100;
for (size_t j = 0; j < stride_float; ++j)