Diffstat (limited to 'thirdparty')
61 files changed, 668 insertions, 463 deletions
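The most visible API change below is to the Godot-specific ENet DTLS hooks: enet_host_dtls_server_setup() and enet_host_dtls_client_setup() now take a single opaque pointer to a Godot TLSOptions object instead of separate key/certificate/verify arguments. The following is only a rough sketch of how a caller adapts to the new prototypes from godot_ext.h; the include paths and the way the TLSOptions object is obtained are assumptions, since they come from Godot's crypto API rather than from this patch.

    #include "enet/enet.h"        /* assumed include path for ENetHost */
    #include "enet/godot_ext.h"   /* new prototypes shown below */

    /* 'tls_options' must point to a Godot TLSOptions object; the hook re-wraps
     * it in a Ref<TLSOptions> internally (see thirdparty/enet/godot.cpp below). */
    int upgrade_to_dtls_server(ENetHost *host, void *tls_options) {
      /* old form: enet_host_dtls_server_setup(host, key_ptr, cert_ptr); */
      return enet_host_dtls_server_setup(host, tls_options);
    }

    int upgrade_to_dtls_client(ENetHost *host, const char *hostname,
                               void *tls_options) {
      /* old form: enet_host_dtls_client_setup(host, cert_ptr, verify, hostname); */
      return enet_host_dtls_client_setup(host, hostname, tls_options);
    }

Inside godot.cpp the same object now feeds both PacketPeerDTLS::connect_to_peer() and DTLSServer::setup(), which is why the separate verify flag and certificate references disappear from ENetDTLSClient and ENetDTLSServer.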
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 38001b8782..f883a3a6da 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -346,7 +346,7 @@ Files extracted from upstream source:
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 1.2.4 (0d1f12546bd803099a60c070517a552483f3790e, 2022)
+- Version: 1.3.0 (b557776962a3dcc985d83bd4ed94e1e2e50d0fa2, 2022)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -376,7 +376,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: git (ea4558d1c0f217f1d67ed7fe0b07896ece88ae18, 2022)
+- Version: git (4a287848fd664ae1c3fc8e5e008560534ceeb526, 2022)
 - License: MIT
 
 Files extracted from upstream repository:
diff --git a/thirdparty/enet/enet/godot_ext.h b/thirdparty/enet/enet/godot_ext.h
index 648f3d2f24..06a621b790 100644
--- a/thirdparty/enet/enet/godot_ext.h
+++ b/thirdparty/enet/enet/godot_ext.h
@@ -11,8 +11,8 @@
  */
 ENET_API void enet_address_set_ip(ENetAddress * address, const uint8_t * ip, size_t size);
 
-ENET_API int enet_host_dtls_server_setup (ENetHost *, void *, void *);
-ENET_API int enet_host_dtls_client_setup (ENetHost *, void *, uint8_t, const char *);
+ENET_API int enet_host_dtls_server_setup (ENetHost *, void *);
+ENET_API int enet_host_dtls_client_setup (ENetHost *, const char *, void *);
 ENET_API void enet_host_refuse_new_connections (ENetHost *, int);
 
 #endif // __ENET_GODOT_EXT_H__
diff --git a/thirdparty/enet/godot.cpp b/thirdparty/enet/godot.cpp
index 47298dcf6a..ea7f4957a2 100644
--- a/thirdparty/enet/godot.cpp
+++ b/thirdparty/enet/godot.cpp
@@ -164,16 +164,14 @@ class ENetDTLSClient : public ENetGodotSocket {
     bool connected = false;
     Ref<PacketPeerUDP> udp;
     Ref<PacketPeerDTLS> dtls;
-    bool verify = false;
+    Ref<TLSOptions> tls_options;
     String for_hostname;
-    Ref<X509Certificate> cert;
     IPAddress local_address;
 
 public:
-    ENetDTLSClient(ENetUDP *p_base, Ref<X509Certificate> p_cert, bool p_verify, String p_for_hostname) {
-        verify = p_verify;
+    ENetDTLSClient(ENetUDP *p_base, String p_for_hostname, Ref<TLSOptions> p_options) {
         for_hostname = p_for_hostname;
-        cert = p_cert;
+        tls_options = p_options;
         udp.instantiate();
         dtls = Ref<PacketPeerDTLS>(PacketPeerDTLS::create());
         if (p_base->bound) {
@@ -205,7 +203,7 @@ public:
     Error sendto(const uint8_t *p_buffer, int p_len, int &r_sent, IPAddress p_ip, uint16_t p_port) {
         if (!connected) {
             udp->connect_to_host(p_ip, p_port);
-            if (dtls->connect_to_peer(udp, verify, for_hostname, cert)) {
+            if (dtls->connect_to_peer(udp, for_hostname, tls_options)) {
                 return FAILED;
             }
             connected = true;
@@ -265,7 +263,7 @@ class ENetDTLSServer : public ENetGodotSocket {
     IPAddress local_address;
 
 public:
-    ENetDTLSServer(ENetUDP *p_base, Ref<CryptoKey> p_key, Ref<X509Certificate> p_cert) {
+    ENetDTLSServer(ENetUDP *p_base, Ref<TLSOptions> p_options) {
         udp_server.instantiate();
         if (p_base->bound) {
             uint16_t port;
@@ -274,7 +272,7 @@ public:
             bind(local_address, port);
         }
         server = Ref<DTLSServer>(DTLSServer::create());
-        server->setup(p_key, p_cert);
+        server->setup(p_options);
     }
 
     ~ENetDTLSServer() {
@@ -437,22 +435,22 @@ ENetSocket enet_socket_create(ENetSocketType type) {
     return socket;
 }
 
-int enet_host_dtls_server_setup(ENetHost *host, void *p_key, void *p_cert) {
+int enet_host_dtls_server_setup(ENetHost *host, void *p_options) {
     ENetGodotSocket *sock = (ENetGodotSocket *)host->socket;
     if (!sock->can_upgrade()) {
         return -1;
     }
-    host->socket = memnew(ENetDTLSServer((ENetUDP *)sock, Ref<CryptoKey>((CryptoKey *)p_key), Ref<X509Certificate>((X509Certificate *)p_cert)));
+    host->socket = memnew(ENetDTLSServer(static_cast<ENetUDP *>(sock), Ref<TLSOptions>(static_cast<TLSOptions *>(p_options))));
     memdelete(sock);
     return 0;
 }
 
-int enet_host_dtls_client_setup(ENetHost *host, void *p_cert, uint8_t p_verify, const char *p_for_hostname) {
+int enet_host_dtls_client_setup(ENetHost *host, const char *p_for_hostname, void *p_options) {
     ENetGodotSocket *sock = (ENetGodotSocket *)host->socket;
     if (!sock->can_upgrade()) {
        return -1;
    }
-    host->socket = memnew(ENetDTLSClient((ENetUDP *)sock, Ref<X509Certificate>((X509Certificate *)p_cert), p_verify, String::utf8(p_for_hostname)));
+    host->socket = memnew(ENetDTLSClient(static_cast<ENetUDP *>(sock), String::utf8(p_for_hostname), Ref<TLSOptions>(static_cast<TLSOptions *>(p_options))));
     memdelete(sock);
     return 0;
 }
diff --git a/thirdparty/libwebp/AUTHORS b/thirdparty/libwebp/AUTHORS
index 3efcbe25b6..2f0c537d1c 100644
--- a/thirdparty/libwebp/AUTHORS
+++ b/thirdparty/libwebp/AUTHORS
@@ -11,11 +11,13 @@ Contributors:
 - Djordje Pesut (djordje dot pesut at imgtec dot com)
 - Frank Barchard (fbarchard at google dot com)
 - Hui Su (huisu at google dot com)
+- H. Vetinari (h dot vetinari at gmx dot com)
 - Ilya Kurdyukov (jpegqs at gmail dot com)
 - Ingvar Stepanyan (rreverser at google dot com)
 - James Zern (jzern at google dot com)
 - Jan Engelhardt (jengelh at medozas dot de)
 - Jehan (jehan at girinstud dot io)
+- Jeremy Maitin-Shepard (jbms at google dot com)
 - Johann Koenig (johann dot koenig at duck dot com)
 - Jovan Zelincevic (jovan dot zelincevic at imgtec dot com)
 - Jyrki Alakuijala (jyrki at google dot com)
diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv.c b/thirdparty/libwebp/sharpyuv/sharpyuv.c
index 8b3ab7216b..7de34fb0b2 100644
--- a/thirdparty/libwebp/sharpyuv/sharpyuv.c
+++ b/thirdparty/libwebp/sharpyuv/sharpyuv.c
@@ -15,16 +15,22 @@
 #include <assert.h>
 #include <limits.h>
-#include <math.h>
+#include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include "src/webp/types.h"
-#include "src/dsp/cpu.h"
+#include "sharpyuv/sharpyuv_cpu.h"
 #include "sharpyuv/sharpyuv_dsp.h"
 #include "sharpyuv/sharpyuv_gamma.h"
 
 //------------------------------------------------------------------------------
+
+int SharpYuvGetVersion(void) {
+  return SHARPYUV_VERSION;
+}
+
+//------------------------------------------------------------------------------
 // Sharp RGB->YUV conversion
 
 static const int kNumIterations = 4;
@@ -414,24 +420,45 @@ static int DoSharpArgbToYuv(const uint8_t* r_ptr, const uint8_t* g_ptr,
 }
 #undef SAFE_ALLOC
 
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+
+#define LOCK_ACCESS \
+  static pthread_mutex_t sharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; \
+  if (pthread_mutex_lock(&sharpyuv_lock)) return
+#define UNLOCK_ACCESS_AND_RETURN \
+  do { \
+    (void)pthread_mutex_unlock(&sharpyuv_lock); \
+    return; \
+  } while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define LOCK_ACCESS do {} while (0)
+#define UNLOCK_ACCESS_AND_RETURN return
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
 // Hidden exported init function.
-// By default SharpYuvConvert calls it with NULL. If needed, users can declare
-// it as extern and call it with a VP8CPUInfo function.
-extern void SharpYuvInit(VP8CPUInfo cpu_info_func);
+// By default SharpYuvConvert calls it with SharpYuvGetCPUInfo. If needed,
+// users can declare it as extern and call it with an alternate VP8CPUInfo
+// function.
+SHARPYUV_EXTERN void SharpYuvInit(VP8CPUInfo cpu_info_func); void SharpYuvInit(VP8CPUInfo cpu_info_func) { static volatile VP8CPUInfo sharpyuv_last_cpuinfo_used = (VP8CPUInfo)&sharpyuv_last_cpuinfo_used; - const int initialized = - (sharpyuv_last_cpuinfo_used != (VP8CPUInfo)&sharpyuv_last_cpuinfo_used); - if (cpu_info_func == NULL && initialized) return; - if (sharpyuv_last_cpuinfo_used == cpu_info_func) return; - - SharpYuvInitDsp(cpu_info_func); - if (!initialized) { - SharpYuvInitGammaTables(); + LOCK_ACCESS; + // Only update SharpYuvGetCPUInfo when called from external code to avoid a + // race on reading the value in SharpYuvConvert(). + if (cpu_info_func != (VP8CPUInfo)&SharpYuvGetCPUInfo) { + SharpYuvGetCPUInfo = cpu_info_func; + } + if (sharpyuv_last_cpuinfo_used == SharpYuvGetCPUInfo) { + UNLOCK_ACCESS_AND_RETURN; } - sharpyuv_last_cpuinfo_used = cpu_info_func; + SharpYuvInitDsp(); + SharpYuvInitGammaTables(); + + sharpyuv_last_cpuinfo_used = SharpYuvGetCPUInfo; + UNLOCK_ACCESS_AND_RETURN; } int SharpYuvConvert(const void* r_ptr, const void* g_ptr, @@ -467,7 +494,8 @@ int SharpYuvConvert(const void* r_ptr, const void* g_ptr, // Stride should be even for uint16_t buffers. return 0; } - SharpYuvInit(NULL); + // The address of the function pointer is used to avoid a read race. + SharpYuvInit((VP8CPUInfo)&SharpYuvGetCPUInfo); // Add scaling factor to go from rgb_bit_depth to yuv_bit_depth, to the // rgb->yuv conversion matrix. diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv.h b/thirdparty/libwebp/sharpyuv/sharpyuv.h index 9386ea2185..181b20a0bc 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv.h @@ -12,15 +12,31 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_H_ #define WEBP_SHARPYUV_SHARPYUV_H_ -#include <inttypes.h> - #ifdef __cplusplus extern "C" { #endif +#ifndef SHARPYUV_EXTERN +#ifdef WEBP_EXTERN +#define SHARPYUV_EXTERN WEBP_EXTERN +#else +// This explicitly marks library functions and allows for changing the +// signature for e.g., Windows DLL builds. +#if defined(__GNUC__) && __GNUC__ >= 4 +#define SHARPYUV_EXTERN extern __attribute__((visibility("default"))) +#else +#if defined(_MSC_VER) && defined(WEBP_DLL) +#define SHARPYUV_EXTERN __declspec(dllexport) +#else +#define SHARPYUV_EXTERN extern +#endif /* _MSC_VER && WEBP_DLL */ +#endif /* __GNUC__ >= 4 */ +#endif /* WEBP_EXTERN */ +#endif /* SHARPYUV_EXTERN */ + // SharpYUV API version following the convention from semver.org #define SHARPYUV_VERSION_MAJOR 0 -#define SHARPYUV_VERSION_MINOR 1 +#define SHARPYUV_VERSION_MINOR 2 #define SHARPYUV_VERSION_PATCH 0 // Version as a uint32_t. The major number is the high 8 bits. // The minor number is the middle 8 bits. The patch number is the low 16 bits. @@ -30,6 +46,10 @@ extern "C" { SHARPYUV_MAKE_VERSION(SHARPYUV_VERSION_MAJOR, SHARPYUV_VERSION_MINOR, \ SHARPYUV_VERSION_PATCH) +// Returns the library's version number, packed in hexadecimal. See +// SHARPYUV_VERSION. +SHARPYUV_EXTERN int SharpYuvGetVersion(void); + // RGB to YUV conversion matrix, in 16 bit fixed point. // y = rgb_to_y[0] * r + rgb_to_y[1] * g + rgb_to_y[2] * b + rgb_to_y[3] // u = rgb_to_u[0] * r + rgb_to_u[1] * g + rgb_to_u[2] * b + rgb_to_u[3] @@ -65,11 +85,13 @@ typedef struct { // adjacent pixels on the y, u and v channels. If yuv_bit_depth > 8, they // should be multiples of 2. 
// width, height: width and height of the image in pixels -int SharpYuvConvert(const void* r_ptr, const void* g_ptr, const void* b_ptr, - int rgb_step, int rgb_stride, int rgb_bit_depth, - void* y_ptr, int y_stride, void* u_ptr, int u_stride, - void* v_ptr, int v_stride, int yuv_bit_depth, int width, - int height, const SharpYuvConversionMatrix* yuv_matrix); +SHARPYUV_EXTERN int SharpYuvConvert(const void* r_ptr, const void* g_ptr, + const void* b_ptr, int rgb_step, + int rgb_stride, int rgb_bit_depth, + void* y_ptr, int y_stride, void* u_ptr, + int u_stride, void* v_ptr, int v_stride, + int yuv_bit_depth, int width, int height, + const SharpYuvConversionMatrix* yuv_matrix); // TODO(b/194336375): Add YUV444 to YUV420 conversion. Maybe also add 422 // support (it's rarely used in practice, especially for images). diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c new file mode 100644 index 0000000000..29425a0c49 --- /dev/null +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.c @@ -0,0 +1,14 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +#include "sharpyuv/sharpyuv_cpu.h" + +// Include src/dsp/cpu.c to create SharpYuvGetCPUInfo from VP8GetCPUInfo. The +// function pointer is renamed in sharpyuv_cpu.h. +#include "src/dsp/cpu.c" diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h new file mode 100644 index 0000000000..176ca3eb16 --- /dev/null +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_cpu.h @@ -0,0 +1,22 @@ +// Copyright 2022 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +#ifndef WEBP_SHARPYUV_SHARPYUV_CPU_H_ +#define WEBP_SHARPYUV_SHARPYUV_CPU_H_ + +#include "sharpyuv/sharpyuv.h" + +// Avoid exporting SharpYuvGetCPUInfo in shared object / DLL builds. +// SharpYuvInit() replaces the use of the function pointer. 
+#undef WEBP_EXTERN +#define WEBP_EXTERN extern +#define VP8GetCPUInfo SharpYuvGetCPUInfo +#include "src/dsp/cpu.h" + +#endif // WEBP_SHARPYUV_SHARPYUV_CPU_H_ diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c index 5334fa64fa..0ad22be945 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.c @@ -13,7 +13,7 @@ #include <assert.h> #include <math.h> -#include <string.h> +#include <stddef.h> static int ToFixed16(float f) { return (int)floor(f * (1 << 16) + 0.5f); } diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h index 63c99ef5cd..3214e3ac60 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_csp.h @@ -35,8 +35,9 @@ typedef struct { } SharpYuvColorSpace; // Fills in 'matrix' for the given YUVColorSpace. -void SharpYuvComputeConversionMatrix(const SharpYuvColorSpace* yuv_color_space, - SharpYuvConversionMatrix* matrix); +SHARPYUV_EXTERN void SharpYuvComputeConversionMatrix( + const SharpYuvColorSpace* yuv_color_space, + SharpYuvConversionMatrix* matrix); // Enums for precomputed conversion matrices. typedef enum { @@ -49,7 +50,7 @@ typedef enum { } SharpYuvMatrixType; // Returns a pointer to a matrix for one of the predefined colorspaces. -const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( +SHARPYUV_EXTERN const SharpYuvConversionMatrix* SharpYuvGetConversionMatrix( SharpYuvMatrixType matrix_type); #ifdef __cplusplus diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c index 956fa7ce55..31c272c408 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.c @@ -16,7 +16,7 @@ #include <assert.h> #include <stdlib.h> -#include "src/dsp/cpu.h" +#include "sharpyuv/sharpyuv_cpu.h" //----------------------------------------------------------------------------- @@ -75,23 +75,24 @@ void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, extern void InitSharpYuvSSE2(void); extern void InitSharpYuvNEON(void); -void SharpYuvInitDsp(VP8CPUInfo cpu_info_func) { - (void)cpu_info_func; - +void SharpYuvInitDsp(void) { #if !WEBP_NEON_OMIT_C_CODE SharpYuvUpdateY = SharpYuvUpdateY_C; SharpYuvUpdateRGB = SharpYuvUpdateRGB_C; SharpYuvFilterRow = SharpYuvFilterRow_C; #endif + if (SharpYuvGetCPUInfo != NULL) { #if defined(WEBP_HAVE_SSE2) - if (cpu_info_func == NULL || cpu_info_func(kSSE2)) { - InitSharpYuvSSE2(); - } + if (SharpYuvGetCPUInfo(kSSE2)) { + InitSharpYuvSSE2(); + } #endif // WEBP_HAVE_SSE2 + } #if defined(WEBP_HAVE_NEON) - if (WEBP_NEON_OMIT_C_CODE || cpu_info_func == NULL || cpu_info_func(kNEON)) { + if (WEBP_NEON_OMIT_C_CODE || + (SharpYuvGetCPUInfo != NULL && SharpYuvGetCPUInfo(kNEON))) { InitSharpYuvNEON(); } #endif // WEBP_HAVE_NEON diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h index e561d8d3d0..805fbadbf6 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_dsp.h @@ -12,9 +12,8 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_DSP_H_ #define WEBP_SHARPYUV_SHARPYUV_DSP_H_ -#include <stdint.h> - -#include "src/dsp/cpu.h" +#include "sharpyuv/sharpyuv_cpu.h" +#include "src/webp/types.h" extern uint64_t (*SharpYuvUpdateY)(const uint16_t* src, const uint16_t* ref, uint16_t* dst, int len, int bit_depth); @@ -24,6 +23,6 @@ extern void (*SharpYuvFilterRow)(const int16_t* A, const int16_t* B, int len, const 
uint16_t* best_y, uint16_t* out, int bit_depth); -void SharpYuvInitDsp(VP8CPUInfo cpu_info_func); +void SharpYuvInitDsp(void); #endif // WEBP_SHARPYUV_SHARPYUV_DSP_H_ diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c index 05b5436f83..20ab2da6bc 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.c @@ -13,7 +13,6 @@ #include <assert.h> #include <math.h> -#include <stdint.h> #include "src/webp/types.h" diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h index 2f1a3ff4a0..d13aff59e1 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_gamma.h @@ -12,7 +12,7 @@ #ifndef WEBP_SHARPYUV_SHARPYUV_GAMMA_H_ #define WEBP_SHARPYUV_SHARPYUV_GAMMA_H_ -#include <stdint.h> +#include "src/webp/types.h" #ifdef __cplusplus extern "C" { diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c b/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c index 5cf6aaffb0..5840914865 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_neon.c @@ -17,11 +17,6 @@ #include <assert.h> #include <stdlib.h> #include <arm_neon.h> -#endif - -extern void InitSharpYuvNEON(void); - -#if defined(WEBP_USE_NEON) static uint16_t clip_NEON(int v, int max) { return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; @@ -169,6 +164,8 @@ static void SharpYuvFilterRow_NEON(const int16_t* A, const int16_t* B, int len, //------------------------------------------------------------------------------ +extern void InitSharpYuvNEON(void); + WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { SharpYuvUpdateY = SharpYuvUpdateY_NEON; SharpYuvUpdateRGB = SharpYuvUpdateRGB_NEON; @@ -177,6 +174,8 @@ WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvNEON(void) { #else // !WEBP_USE_NEON +extern void InitSharpYuvNEON(void); + void InitSharpYuvNEON(void) {} #endif // WEBP_USE_NEON diff --git a/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c b/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c index 1943873748..9744d1bb6c 100644 --- a/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c +++ b/thirdparty/libwebp/sharpyuv/sharpyuv_sse2.c @@ -16,11 +16,6 @@ #if defined(WEBP_USE_SSE2) #include <stdlib.h> #include <emmintrin.h> -#endif - -extern void InitSharpYuvSSE2(void); - -#if defined(WEBP_USE_SSE2) static uint16_t clip_SSE2(int v, int max) { return (v < 0) ? 0 : (v > max) ? max : (uint16_t)v; @@ -199,6 +194,8 @@ WEBP_TSAN_IGNORE_FUNCTION void InitSharpYuvSSE2(void) { } #else // !WEBP_USE_SSE2 +extern void InitSharpYuvSSE2(void); + void InitSharpYuvSSE2(void) {} #endif // WEBP_USE_SSE2 diff --git a/thirdparty/libwebp/src/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h index 30c1bd3ef9..83791ecd25 100644 --- a/thirdparty/libwebp/src/dec/vp8i_dec.h +++ b/thirdparty/libwebp/src/dec/vp8i_dec.h @@ -31,8 +31,8 @@ extern "C" { // version numbers #define DEC_MAJ_VERSION 1 -#define DEC_MIN_VERSION 2 -#define DEC_REV_VERSION 4 +#define DEC_MIN_VERSION 3 +#define DEC_REV_VERSION 0 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline). 
// Constraints are: We need to store one 16x16 block of luma samples (y), diff --git a/thirdparty/libwebp/src/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c index 1348055128..c0ea0181e5 100644 --- a/thirdparty/libwebp/src/dec/vp8l_dec.c +++ b/thirdparty/libwebp/src/dec/vp8l_dec.c @@ -1336,7 +1336,7 @@ static int ReadTransform(int* const xsize, int const* ysize, ok = ok && ExpandColorMap(num_colors, transform); break; } - case SUBTRACT_GREEN: + case SUBTRACT_GREEN_TRANSFORM: break; default: assert(0); // can't happen diff --git a/thirdparty/libwebp/src/dec/webp_dec.c b/thirdparty/libwebp/src/dec/webp_dec.c index 77a54c55d2..3f4f7bb659 100644 --- a/thirdparty/libwebp/src/dec/webp_dec.c +++ b/thirdparty/libwebp/src/dec/webp_dec.c @@ -179,7 +179,7 @@ static VP8StatusCode ParseOptionalChunks(const uint8_t** const data, return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size. } // For odd-sized chunk-payload, there's one byte padding at the end. - disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1; + disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1u; total_size += disk_chunk_size; // Check that total bytes skipped so far does not exceed riff_size. diff --git a/thirdparty/libwebp/src/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c index 41387ec2d6..324e5eb993 100644 --- a/thirdparty/libwebp/src/demux/demux.c +++ b/thirdparty/libwebp/src/demux/demux.c @@ -24,8 +24,8 @@ #include "src/webp/format_constants.h" #define DMUX_MAJ_VERSION 1 -#define DMUX_MIN_VERSION 2 -#define DMUX_REV_VERSION 4 +#define DMUX_MIN_VERSION 3 +#define DMUX_REV_VERSION 0 typedef struct { size_t start_; // start location of the data diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c index a5f8c9f7c7..f0843d0feb 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c @@ -26,8 +26,8 @@ static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, uint32_t alpha_and = 0xff; int i, j; const __m128i zero = _mm_setzero_si128(); - const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i rgb_mask = _mm_set1_epi32((int)0xffffff00); // to preserve RGB + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -106,8 +106,8 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, // value is not 0xff if any of the alpha[] is not equal to 0xff. 
uint32_t alpha_and = 0xff; int i, j; - const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha - const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u); + const __m128i a_mask = _mm_set1_epi32(0xff); // to preserve alpha + const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte @@ -178,7 +178,7 @@ static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, int w, int h, int stride) { const __m128i zero = _mm_setzero_si128(); - const __m128i kMult = _mm_set1_epi16(0x8081u); + const __m128i kMult = _mm_set1_epi16((short)0x8081); const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); const int kSpan = 4; while (h-- > 0) { @@ -267,7 +267,7 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) { } static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { - const __m128i m_color = _mm_set1_epi32(color); + const __m128i m_color = _mm_set1_epi32((int)color); const __m128i zero = _mm_setzero_si128(); int i = 0; for (; i + 8 <= length; i += 8) { diff --git a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c index cdf877ce49..1156ac3417 100644 --- a/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c +++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c @@ -26,7 +26,7 @@ static int ExtractAlpha_SSE41(const uint8_t* WEBP_RESTRICT argb, // value is not 0xff if any of the alpha[] is not equal to 0xff. uint32_t alpha_and = 0xff; int i, j; - const __m128i all_0xff = _mm_set1_epi32(~0u); + const __m128i all_0xff = _mm_set1_epi32(~0); __m128i all_alphas = all_0xff; // We must be able to access 3 extra bytes after the last written byte diff --git a/thirdparty/libwebp/src/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c index a4ba7f2cb7..62de73f750 100644 --- a/thirdparty/libwebp/src/dsp/cpu.c +++ b/thirdparty/libwebp/src/dsp/cpu.c @@ -212,7 +212,7 @@ VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo; #elif defined(WEBP_HAVE_NEON) // In most cases this function doesn't check for NEON support (it's assumed by // the configuration), but enables turning off NEON at runtime, for testing -// purposes, by setting VP8DecGetCPUInfo = NULL. +// purposes, by setting VP8GetCPUInfo = NULL. static int armCPUInfo(CPUFeature feature) { if (feature != kNEON) return 0; #if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD) diff --git a/thirdparty/libwebp/src/dsp/cpu.h b/thirdparty/libwebp/src/dsp/cpu.h index 57a40d87d4..be80727c0d 100644 --- a/thirdparty/libwebp/src/dsp/cpu.h +++ b/thirdparty/libwebp/src/dsp/cpu.h @@ -14,6 +14,8 @@ #ifndef WEBP_DSP_CPU_H_ #define WEBP_DSP_CPU_H_ +#include <stddef.h> + #ifdef HAVE_CONFIG_H #include "src/webp/config.h" #endif diff --git a/thirdparty/libwebp/src/dsp/dec_sse2.c b/thirdparty/libwebp/src/dsp/dec_sse2.c index 873aa59e8a..01e6bcb636 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse2.c +++ b/thirdparty/libwebp/src/dsp/dec_sse2.c @@ -158,10 +158,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS)); } else { // Load four bytes/pixels per line. 
- dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); } // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); @@ -187,10 +187,10 @@ static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) { _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3); } else { // Store four bytes/pixels per line. - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } } } @@ -213,10 +213,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { const __m128i m3 = _mm_subs_epi16(B, d4); const __m128i zero = _mm_setzero_si128(); // Load the source pixels. - __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS)); - __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS)); - __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS)); - __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS)); + __m128i dst0 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 0 * BPS)); + __m128i dst1 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 1 * BPS)); + __m128i dst2 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 2 * BPS)); + __m128i dst3 = _mm_cvtsi32_si128(WebPMemToInt32(dst + 3 * BPS)); // Convert to 16b. dst0 = _mm_unpacklo_epi8(dst0, zero); dst1 = _mm_unpacklo_epi8(dst1, zero); @@ -233,10 +233,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) { dst2 = _mm_packus_epi16(dst2, dst2); dst3 = _mm_packus_epi16(dst3, dst3); // Store the results. 
- WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0)); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1)); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2)); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3)); } #undef MUL #endif // USE_TRANSFORM_AC3 @@ -477,11 +477,11 @@ static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride, // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 const __m128i A0 = _mm_set_epi32( - WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]), - WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride])); + WebPMemToInt32(&b[6 * stride]), WebPMemToInt32(&b[2 * stride]), + WebPMemToInt32(&b[4 * stride]), WebPMemToInt32(&b[0 * stride])); const __m128i A1 = _mm_set_epi32( - WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]), - WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride])); + WebPMemToInt32(&b[7 * stride]), WebPMemToInt32(&b[3 * stride]), + WebPMemToInt32(&b[5 * stride]), WebPMemToInt32(&b[1 * stride])); // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 @@ -540,7 +540,7 @@ static WEBP_INLINE void Store4x4_SSE2(__m128i* const x, uint8_t* dst, int stride) { int i; for (i = 0; i < 4; ++i, dst += stride) { - WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(*x)); *x = _mm_srli_si128(*x, 4); } } @@ -908,10 +908,10 @@ static void VE4_SSE2(uint8_t* dst) { // vertical const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -925,10 +925,10 @@ static void LD4_SSE2(uint8_t* dst) { // Down-Left const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static void VR4_SSE2(uint8_t* dst) { // Vertical-Right @@ -946,10 +946,10 @@ static void VR4_SSE2(uint8_t* dst) { // Vertical-Right const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, 
_mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -970,11 +970,12 @@ static void VL4_SSE2(uint8_t* dst) { // Vertical-Left const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -990,7 +991,7 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const uint32_t K = dst[-1 + 2 * BPS]; const uint32_t L = dst[-1 + 3 * BPS]; const __m128i LKJI_____ = - _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24)); + _mm_cvtsi32_si128((int)(L | (K << 8) | (J << 16) | (I << 24))); const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD); const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); @@ -998,10 +999,10 @@ static void RD4_SSE2(uint8_t* dst) { // Down-right const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } #undef DST @@ -1015,13 +1016,13 @@ static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) { const __m128i zero = _mm_setzero_si128(); int y; if (size == 4) { - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); for (y = 0; y < 4; ++y, dst += BPS) { const int val = dst[-1] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } else if (size == 8) { const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); @@ -1062,7 +1063,7 @@ static void VE16_SSE2(uint8_t* 
dst) { static void HE16_SSE2(uint8_t* dst) { // horizontal int j; for (j = 16; j > 0; --j) { - const __m128i values = _mm_set1_epi8(dst[-1]); + const __m128i values = _mm_set1_epi8((char)dst[-1]); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; } @@ -1070,7 +1071,7 @@ static void HE16_SSE2(uint8_t* dst) { // horizontal static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_storeu_si128((__m128i*)(dst + j * BPS), values); } @@ -1130,7 +1131,7 @@ static void VE8uv_SSE2(uint8_t* dst) { // vertical // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } diff --git a/thirdparty/libwebp/src/dsp/dec_sse41.c b/thirdparty/libwebp/src/dsp/dec_sse41.c index 8f18506d54..08a3630272 100644 --- a/thirdparty/libwebp/src/dsp/dec_sse41.c +++ b/thirdparty/libwebp/src/dsp/dec_sse41.c @@ -23,7 +23,7 @@ static void HE16_SSE41(uint8_t* dst) { // horizontal int j; const __m128i kShuffle3 = _mm_set1_epi8(3); for (j = 16; j > 0; --j) { - const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4)); + const __m128i in = _mm_cvtsi32_si128(WebPMemToInt32(dst - 4)); const __m128i values = _mm_shuffle_epi8(in, kShuffle3); _mm_storeu_si128((__m128i*)dst, values); dst += BPS; diff --git a/thirdparty/libwebp/src/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c index 601962ba76..3a04111c55 100644 --- a/thirdparty/libwebp/src/dsp/enc_neon.c +++ b/thirdparty/libwebp/src/dsp/enc_neon.c @@ -764,9 +764,14 @@ static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a, // Horizontal sum of all four uint32_t values in 'sum'. static int SumToInt_NEON(uint32x4_t sum) { +#if defined(__aarch64__) + return (int)vaddvq_u32(sum); +#else const uint64x2_t sum2 = vpaddlq_u32(sum); - const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); - return (int)sum3; + const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)), + vreinterpret_u32_u64(vget_high_u64(sum2))); + return (int)vget_lane_u32(sum3, 0); +#endif } static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) { diff --git a/thirdparty/libwebp/src/dsp/enc_sse2.c b/thirdparty/libwebp/src/dsp/enc_sse2.c index b2e78ed941..1d1055668f 100644 --- a/thirdparty/libwebp/src/dsp/enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/enc_sse2.c @@ -156,10 +156,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. - ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); - ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); - ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); - ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); + ref0 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[0 * BPS])); + ref1 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[1 * BPS])); + ref2 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[2 * BPS])); + ref3 = _mm_cvtsi32_si128(WebPMemToInt32(&ref[3 * BPS])); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); @@ -185,10 +185,10 @@ static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. 
- WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); - WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); - WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); - WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); + WebPInt32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); + WebPInt32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); + WebPInt32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); + WebPInt32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); } } } @@ -481,7 +481,7 @@ static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred, // helper for chroma-DC predictions static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 8; ++j) { _mm_storel_epi64((__m128i*)(dst + j * BPS), values); } @@ -489,7 +489,7 @@ static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) { static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) { int j; - const __m128i values = _mm_set1_epi8(v); + const __m128i values = _mm_set1_epi8((char)v); for (j = 0; j < 16; ++j) { _mm_store_si128((__m128i*)(dst + j * BPS), values); } @@ -540,7 +540,7 @@ static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst, static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 8; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_storel_epi64((__m128i*)dst, values); dst += BPS; } @@ -549,7 +549,7 @@ static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) { static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) { int j; for (j = 0; j < 16; ++j) { - const __m128i values = _mm_set1_epi8(left[j]); + const __m128i values = _mm_set1_epi8((char)left[j]); _mm_store_si128((__m128i*)dst, values); dst += BPS; } @@ -722,10 +722,10 @@ static WEBP_INLINE void VE4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); const __m128i b = _mm_subs_epu8(a, lsb); const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); - const uint32_t vals = _mm_cvtsi128_si32(avg); + const int vals = _mm_cvtsi128_si32(avg); int i; for (i = 0; i < 4; ++i) { - WebPUint32ToMem(dst + i * BPS, vals); + WebPInt32ToMem(dst + i * BPS, vals); } } @@ -760,10 +760,10 @@ static WEBP_INLINE void LD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void VR4_SSE2(uint8_t* dst, @@ -782,10 +782,10 @@ static WEBP_INLINE void VR4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i efgh = _mm_avg_epu8(avg2, XABCD); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); - WebPUint32ToMem(dst + 1 * BPS, 
_mm_cvtsi128_si32( efgh )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); // these two are hard to implement in SSE2, so we keep the C-version: DST(0, 2) = AVG3(J, I, X); @@ -807,11 +807,12 @@ static WEBP_INLINE void VL4_SSE2(uint8_t* dst, const __m128i abbc = _mm_or_si128(ab, bc); const __m128i lsb2 = _mm_and_si128(abbc, lsb1); const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); - const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); + const uint32_t extra_out = + (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); // these two are hard to get and irregular DST(3, 2) = (extra_out >> 0) & 0xff; @@ -829,10 +830,10 @@ static WEBP_INLINE void RD4_SSE2(uint8_t* dst, const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); const __m128i avg2 = _mm_subs_epu8(avg1, lsb); const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); - WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); - WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); - WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); - WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); + WebPInt32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); + WebPInt32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); + WebPInt32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); + WebPInt32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); } static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) { @@ -875,14 +876,14 @@ static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) { static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) { const __m128i zero = _mm_setzero_si128(); - const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); + const __m128i top_values = _mm_cvtsi32_si128(WebPMemToInt32(top)); const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); int y; for (y = 0; y < 4; ++y, dst += BPS) { const int val = top[-2 - y] - top[-1]; const __m128i base = _mm_set1_epi16(val); const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); - WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); + WebPInt32ToMem(dst, _mm_cvtsi128_si32(out)); } } diff --git a/thirdparty/libwebp/src/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c index 84a54296fd..fb86e58d4a 100644 --- a/thirdparty/libwebp/src/dsp/lossless.c +++ b/thirdparty/libwebp/src/dsp/lossless.c @@ -49,7 +49,7 @@ static WEBP_INLINE uint32_t Clip255(uint32_t a) { } static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) { - return 
Clip255(a + b - c); + return Clip255((uint32_t)(a + b - c)); } static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, @@ -66,7 +66,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1, } static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) { - return Clip255(a + (a - b) / 2); + return Clip255((uint32_t)(a + (a - b) / 2)); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1, @@ -293,10 +293,10 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, const uint32_t red = argb >> 16; int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red += ColorTransformDelta(m->green_to_red_, green); + new_red += ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue += ColorTransformDelta(m->green_to_blue_, green); - new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red); + new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue += ColorTransformDelta((int8_t)m->red_to_blue_, (int8_t)new_red); new_blue &= 0xff; dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -395,7 +395,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform, assert(row_start < row_end); assert(row_end <= transform->ysize_); switch (transform->type_) { - case SUBTRACT_GREEN: + case SUBTRACT_GREEN_TRANSFORM: VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out); break; case PREDICTOR_TRANSFORM: diff --git a/thirdparty/libwebp/src/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c index de6c4ace5f..b1f9f26d72 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc.c @@ -522,11 +522,11 @@ static void GetCombinedEntropyUnrefined_C(const uint32_t X[], void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { int i; for (i = 0; i < num_pixels; ++i) { - const int argb = argb_data[i]; + const int argb = (int)argb_data[i]; const int green = (argb >> 8) & 0xff; const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff; const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff; - argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b; + argb_data[i] = ((uint32_t)argb & 0xff00ff00u) | (new_r << 16) | new_b; } } @@ -547,10 +547,10 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data, const int8_t red = U32ToS8(argb >> 16); int new_red = red & 0xff; int new_blue = argb & 0xff; - new_red -= ColorTransformDelta(m->green_to_red_, green); + new_red -= ColorTransformDelta((int8_t)m->green_to_red_, green); new_red &= 0xff; - new_blue -= ColorTransformDelta(m->green_to_blue_, green); - new_blue -= ColorTransformDelta(m->red_to_blue_, red); + new_blue -= ColorTransformDelta((int8_t)m->green_to_blue_, green); + new_blue -= ColorTransformDelta((int8_t)m->red_to_blue_, red); new_blue &= 0xff; data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue); } @@ -560,7 +560,7 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); int new_red = argb >> 16; - new_red -= ColorTransformDelta(green_to_red, green); + new_red -= ColorTransformDelta((int8_t)green_to_red, green); return (new_red & 0xff); } @@ -569,9 +569,9 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue, uint32_t argb) { const int8_t green = U32ToS8(argb >> 8); const int8_t red = U32ToS8(argb >> 16); - uint8_t new_blue = argb & 0xff; - new_blue -= ColorTransformDelta(green_to_blue, green); - new_blue 
-= ColorTransformDelta(red_to_blue, red); + int new_blue = argb & 0xff; + new_blue -= ColorTransformDelta((int8_t)green_to_blue, green); + new_blue -= ColorTransformDelta((int8_t)red_to_blue, red); return (new_blue & 0xff); } diff --git a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c index 948001a3d5..66cbaab772 100644 --- a/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -54,8 +54,8 @@ static void TransformColor_SSE2(const VP8LMultipliers* const m, const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_)); const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0); - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks - const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks + const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb @@ -376,7 +376,7 @@ static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits, break; } case 2: { - const __m128i mask_or = _mm_set1_epi32(0xff000000); + const __m128i mask_or = _mm_set1_epi32((int)0xff000000); const __m128i mul_cst = _mm_set1_epi16(0x0104); const __m128i mask_mul = _mm_set1_epi16(0x0f00); for (x = 0; x + 16 <= width; x += 16, dst += 4) { @@ -427,7 +427,7 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0, static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_sub_epi8(src, black); diff --git a/thirdparty/libwebp/src/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c index 396cb0bdfc..4b6a532c23 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse2.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c @@ -27,23 +27,22 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i V1 = _mm_add_epi16(C0, C1); const __m128i V2 = _mm_sub_epi16(V1, C2); const __m128i b = _mm_packus_epi16(V2, V2); - const uint32_t output = _mm_cvtsi128_si32(b); - return output; + return (uint32_t)_mm_cvtsi128_si32(b); } static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, uint32_t c1, uint32_t c2) { const __m128i zero = _mm_setzero_si128(); - const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero); - const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero); - const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero); + const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c0), zero); + const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c1), zero); + const __m128i B0 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128((int)c2), zero); const __m128i avg = _mm_add_epi16(C1, C0); const __m128i A0 = _mm_srli_epi16(avg, 1); const __m128i A1 = _mm_sub_epi16(A0, B0); @@ -52,16 +51,15 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0, const __m128i A3 = _mm_srai_epi16(A2, 1); const __m128i A4 = _mm_add_epi16(A0, A3); const __m128i A5 = _mm_packus_epi16(A4, A4); - const uint32_t output = _mm_cvtsi128_si32(A5); - return output; + return (uint32_t)_mm_cvtsi128_si32(A5); } static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) { int pa_minus_pb; const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_cvtsi32_si128(a); - const __m128i B0 = _mm_cvtsi32_si128(b); - const __m128i C0 = _mm_cvtsi32_si128(c); + const __m128i A0 = _mm_cvtsi32_si128((int)a); + const __m128i B0 = _mm_cvtsi32_si128((int)b); + const __m128i C0 = _mm_cvtsi32_si128((int)c); const __m128i AC0 = _mm_subs_epu8(A0, C0); const __m128i CA0 = _mm_subs_epu8(C0, A0); const __m128i BC0 = _mm_subs_epu8(B0, C0); @@ -94,8 +92,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, __m128i* const avg) { // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1) const __m128i ones = _mm_set1_epi8(1); - const __m128i A0 = _mm_cvtsi32_si128(a0); - const __m128i A1 = _mm_cvtsi32_si128(a1); + const __m128i A0 = _mm_cvtsi32_si128((int)a0); + const __m128i A1 = _mm_cvtsi32_si128((int)a1); const __m128i avg1 = _mm_avg_epu8(A0, A1); const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones); *avg = _mm_sub_epi8(avg1, one); @@ -103,8 +101,8 @@ static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0, static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { const __m128i zero = _mm_setzero_si128(); - const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a0), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(A1, A0); return _mm_srli_epi16(sum, 1); } @@ -112,19 +110,18 @@ static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) { static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) { __m128i output; Average2_uint32_SSE2(a0, a1, &output); - return _mm_cvtsi128_si32(output); + return (uint32_t)_mm_cvtsi128_si32(output); } static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1, uint32_t a2) { const __m128i zero = _mm_setzero_si128(); const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2); - const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero); + const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)a1), zero); const __m128i sum = _mm_add_epi16(avg1, A1); const __m128i avg2 = _mm_srli_epi16(sum, 1); const __m128i A2 = _mm_packus_epi16(avg2, avg2); - const uint32_t output = _mm_cvtsi128_si32(A2); - return output; + return (uint32_t)_mm_cvtsi128_si32(A2); } static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, @@ -134,8 +131,7 @@ static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1, const __m128i sum = _mm_add_epi16(avg2, avg1); const __m128i avg3 = _mm_srli_epi16(sum, 1); const __m128i A0 = _mm_packus_epi16(avg3, avg3); - const uint32_t output = _mm_cvtsi128_si32(A0); - return output; + return (uint32_t)_mm_cvtsi128_si32(A0); } static uint32_t Predictor5_SSE2(const uint32_t* const left, @@ -192,7 +188,7 @@ static uint32_t Predictor13_SSE2(const uint32_t* 
const left, static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - const __m128i black = _mm_set1_epi32(ARGB_BLACK); + const __m128i black = _mm_set1_epi32((int)ARGB_BLACK); for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); const __m128i res = _mm_add_epi8(src, black); @@ -208,7 +204,7 @@ static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper, static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i prev = _mm_set1_epi32(out[-1]); + __m128i prev = _mm_set1_epi32((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { // a | b | c | d const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); @@ -285,12 +281,12 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) #undef GENERATE_PREDICTOR_2 // Predictor10: average of (average of (L,TL), average of (T, TR)). -#define DO_PRED10(OUT) do { \ - __m128i avgLTL, avg; \ - Average2_m128i(&L, &TL, &avgLTL); \ - Average2_m128i(&avgTTR, &avgLTL, &avg); \ - L = _mm_add_epi8(avg, src); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ +#define DO_PRED10(OUT) do { \ + __m128i avgLTL, avg; \ + Average2_m128i(&L, &TL, &avgLTL); \ + Average2_m128i(&avgTTR, &avgLTL, &avg); \ + L = _mm_add_epi8(avg, src); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED10_SHIFT do { \ @@ -303,7 +299,7 @@ GENERATE_PREDICTOR_2(9, upper[i + 1]) static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -336,7 +332,7 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper, const __m128i B = _mm_andnot_si128(mask, T); \ const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \ L = _mm_add_epi8(src, pred); \ - out[i + (OUT)] = _mm_cvtsi128_si32(L); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L); \ } while (0) #define DO_PRED11_SHIFT do { \ @@ -351,7 +347,7 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; __m128i pa; - __m128i L = _mm_cvtsi32_si128(out[-1]); + __m128i L = _mm_cvtsi32_si128((int)out[-1]); for (i = 0; i + 4 <= num_pixels; i += 4) { __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]); @@ -384,12 +380,12 @@ static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper, #undef DO_PRED11_SHIFT // Predictor12: ClampedAddSubtractFull. 
-#define DO_PRED12(DIFF, LANE, OUT) do { \ - const __m128i all = _mm_add_epi16(L, (DIFF)); \ - const __m128i alls = _mm_packus_epi16(all, all); \ - const __m128i res = _mm_add_epi8(src, alls); \ - out[i + (OUT)] = _mm_cvtsi128_si32(res); \ - L = _mm_unpacklo_epi8(res, zero); \ +#define DO_PRED12(DIFF, LANE, OUT) do { \ + const __m128i all = _mm_add_epi16(L, (DIFF)); \ + const __m128i alls = _mm_packus_epi16(all, all); \ + const __m128i res = _mm_add_epi8(src, alls); \ + out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(res); \ + L = _mm_unpacklo_epi8(res, zero); \ } while (0) #define DO_PRED12_SHIFT(DIFF, LANE) do { \ @@ -402,7 +398,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper, int num_pixels, uint32_t* out) { int i; const __m128i zero = _mm_setzero_si128(); - const __m128i L8 = _mm_cvtsi32_si128(out[-1]); + const __m128i L8 = _mm_cvtsi32_si128((int)out[-1]); __m128i L = _mm_unpacklo_epi8(L8, zero); for (i = 0; i + 4 <= num_pixels; i += 4) { // Load 4 pixels at a time. @@ -468,7 +464,7 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m, const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0); #undef MK_CST_16 #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); // alpha-green masks int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb @@ -532,7 +528,7 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels, static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu); + const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ff); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -561,7 +557,7 @@ static void ConvertBGRAToRGBA_SSE2(const uint32_t* src, static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { const __m128i mask_0x0f = _mm_set1_epi8(0x0f); - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { @@ -596,8 +592,8 @@ static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src, static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) { - const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); - const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); + const __m128i mask_0xe0 = _mm_set1_epi8((char)0xe0); + const __m128i mask_0xf8 = _mm_set1_epi8((char)0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; diff --git a/thirdparty/libwebp/src/dsp/lossless_sse41.c b/thirdparty/libwebp/src/dsp/lossless_sse41.c index b0d6daa7fe..bb7ce7611f 100644 --- a/thirdparty/libwebp/src/dsp/lossless_sse41.c +++ b/thirdparty/libwebp/src/dsp/lossless_sse41.c @@ -25,11 +25,12 @@ static void TransformColorInverse_SSE41(const VP8LMultipliers* const m, int num_pixels, uint32_t* dst) { // sign-extended multiplying constants, pre-shifted by 5. 
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend - const __m128i mults_rb = _mm_set1_epi32((uint32_t)CST(green_to_red_) << 16 | - (CST(green_to_blue_) & 0xffff)); + const __m128i mults_rb = + _mm_set1_epi32((int)((uint32_t)CST(green_to_red_) << 16 | + (CST(green_to_blue_) & 0xffff))); const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue_)); #undef CST - const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); + const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00); const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13); const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1, diff --git a/thirdparty/libwebp/src/dsp/quant.h b/thirdparty/libwebp/src/dsp/quant.h index 5e8dba8d19..fc099bf9d6 100644 --- a/thirdparty/libwebp/src/dsp/quant.h +++ b/thirdparty/libwebp/src/dsp/quant.h @@ -21,10 +21,15 @@ #define IsFlat IsFlat_NEON -static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { +static uint32_t horizontal_add_uint32x4(const uint32x4_t a) { +#if defined(__aarch64__) + return vaddvq_u32(a); +#else const uint64x2_t b = vpaddlq_u32(a); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif } static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, @@ -45,7 +50,7 @@ static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks, levels += 16; } - return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0); + return thresh >= (int)horizontal_add_uint32x4(sum); } #else diff --git a/thirdparty/libwebp/src/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c index d7effea16e..3f18e94e93 100644 --- a/thirdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c @@ -85,7 +85,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk, const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum); const __m128i out = _mm_madd_epi16(cur_pixels, mult); assert(sizeof(*frow) == sizeof(uint32_t)); - WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); + WebPInt32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out)); frow += 1; if (frow >= frow_end) break; accum -= wrk->x_sub; @@ -132,7 +132,7 @@ static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, __m128i base = zero; accum += wrk->x_add; while (accum > 0) { - const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); + const __m128i A = _mm_cvtsi32_si128(WebPMemToInt32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 @@ -198,7 +198,7 @@ static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0, const __m128i* const mult, uint8_t* const dst) { const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); - const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0); + const __m128i mask = _mm_set_epi32(~0, 0, ~0, 0); const __m128i B0 = _mm_mul_epu32(*A0, *mult); const __m128i B1 = _mm_mul_epu32(*A1, *mult); const __m128i B2 = _mm_mul_epu32(*A2, *mult); diff --git a/thirdparty/libwebp/src/dsp/upsampling_sse2.c b/thirdparty/libwebp/src/dsp/upsampling_sse2.c index 340f1e2ac2..08b6d0b1cf 100644 --- a/thirdparty/libwebp/src/dsp/upsampling_sse2.c +++ b/thirdparty/libwebp/src/dsp/upsampling_sse2.c @@ -121,7 +121,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ int uv_pos, pos; \ /* 16byte-aligned array 
to cache reconstructed u and v */ \ uint8_t uv_buf[14 * 32 + 15] = { 0 }; \ - uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15); \ uint8_t* const r_v = r_u + 32; \ \ assert(top_y != NULL); \ diff --git a/thirdparty/libwebp/src/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c index 970bbb7884..01a48f9af2 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse2.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE2) -#include "src/dsp/common_sse2.h" #include <stdlib.h> #include <emmintrin.h> +#include "src/dsp/common_sse2.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. @@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } @@ -130,7 +132,7 @@ static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R, const __m128i rg0 = _mm_packus_epi16(*B, *A); const __m128i ba0 = _mm_packus_epi16(*R, *G); #endif - const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); + const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); @@ -147,9 +149,10 @@ static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, const __m128i r0 = _mm_packus_epi16(*R, *R); const __m128i g0 = _mm_packus_epi16(*G, *G); const __m128i b0 = _mm_packus_epi16(*B, *B); - const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8)); + const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8)); const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); - const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5); + const __m128i g1 = + _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5); const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); const __m128i rg = _mm_or_si128(r1, g1); const __m128i gb = _mm_or_si128(g2, b1); diff --git a/thirdparty/libwebp/src/dsp/yuv_sse41.c b/thirdparty/libwebp/src/dsp/yuv_sse41.c index 579d1f7402..f79b802e47 100644 --- a/thirdparty/libwebp/src/dsp/yuv_sse41.c +++ b/thirdparty/libwebp/src/dsp/yuv_sse41.c @@ -15,10 +15,12 @@ #if defined(WEBP_USE_SSE41) -#include "src/dsp/common_sse41.h" #include <stdlib.h> #include <smmintrin.h> +#include "src/dsp/common_sse41.h" +#include "src/utils/utils.h" + //----------------------------------------------------------------------------- // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. 
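
Several of the hunks above replace literals such as _mm_set1_epi8(0xf0) with _mm_set1_epi8((char)0xf0) (and likewise for 0xe0/0xf8). The intrinsic takes a char, and on targets where char is signed the value 0xf0 (240) does not fit, so the implicit narrowing is implementation-defined and can trip conversion/narrowing warnings; the explicit cast keeps the intended 0xF0 bit pattern while making the narrowing deliberate. A tiny illustrative sketch of the pattern, not taken from the patch itself:

    #include <emmintrin.h>

    /* Mask that keeps the high nibble of every byte in the vector. */
    static WEBP_INLINE __m128i high_nibble_mask(void) {
      /* (char)0xf0 is -16 where char is signed; the stored byte is still 0xF0. */
      return _mm_set1_epi8((char)0xf0);
    }
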
@@ -74,7 +76,7 @@ static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) { // Load and replicate the U/V samples static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) { const __m128i zero = _mm_setzero_si128(); - const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); + const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples } diff --git a/thirdparty/libwebp/src/enc/analysis_enc.c b/thirdparty/libwebp/src/enc/analysis_enc.c index ebb784261c..a0001ac034 100644 --- a/thirdparty/libwebp/src/enc/analysis_enc.c +++ b/thirdparty/libwebp/src/enc/analysis_enc.c @@ -391,12 +391,14 @@ static int DoSegmentsJob(void* arg1, void* arg2) { return ok; } +#ifdef WEBP_USE_THREAD static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) { int i; for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i]; dst->alpha += src->alpha; dst->uv_alpha += src->uv_alpha; } +#endif // initialize the job struct with some tasks to perform static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job, @@ -425,10 +427,10 @@ int VP8EncAnalyze(VP8Encoder* const enc) { (enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled. if (do_segments) { const int last_row = enc->mb_h_; - // We give a little more than a half work to the main thread. - const int split_row = (9 * last_row + 15) >> 4; const int total_mb = last_row * enc->mb_w_; #ifdef WEBP_USE_THREAD + // We give a little more than a half work to the main thread. + const int split_row = (9 * last_row + 15) >> 4; const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow); #else @@ -438,6 +440,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { WebPGetWorkerInterface(); SegmentJob main_job; if (do_mt) { +#ifdef WEBP_USE_THREAD SegmentJob side_job; // Note the use of '&' instead of '&&' because we must call the functions // no matter what. @@ -455,6 +458,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) { } worker_interface->End(&side_job.worker); if (ok) MergeJobs(&side_job, &main_job); // merge results together +#endif // WEBP_USE_THREAD } else { // Even for single-thread case, we use the generic Worker tools. 
InitSegmentJob(enc, &main_job, 0, last_row); diff --git a/thirdparty/libwebp/src/enc/picture_csp_enc.c b/thirdparty/libwebp/src/enc/picture_csp_enc.c index fabebcf202..78c8ca479b 100644 --- a/thirdparty/libwebp/src/enc/picture_csp_enc.c +++ b/thirdparty/libwebp/src/enc/picture_csp_enc.c @@ -69,10 +69,12 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height, int WebPPictureHasTransparency(const WebPPicture* picture) { if (picture == NULL) return 0; if (picture->use_argb) { - const int alpha_offset = ALPHA_OFFSET; - return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset, - picture->width, picture->height, - 4, picture->argb_stride * sizeof(*picture->argb)); + if (picture->argb != NULL) { + return CheckNonOpaque((const uint8_t*)picture->argb + ALPHA_OFFSET, + picture->width, picture->height, + 4, picture->argb_stride * sizeof(*picture->argb)); + } + return 0; } return CheckNonOpaque(picture->a, picture->width, picture->height, 1, picture->a_stride); @@ -170,21 +172,6 @@ static const int kMinDimensionIterativeConversion = 4; //------------------------------------------------------------------------------ // Main function -extern void SharpYuvInit(VP8CPUInfo cpu_info_func); - -static void SafeInitSharpYuv(void) { -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) - static pthread_mutex_t initsharpyuv_lock = PTHREAD_MUTEX_INITIALIZER; - if (pthread_mutex_lock(&initsharpyuv_lock)) return; -#endif - - SharpYuvInit(VP8GetCPUInfo); - -#if defined(WEBP_USE_THREAD) && !defined(_WIN32) - (void)pthread_mutex_unlock(&initsharpyuv_lock); -#endif -} - static int PreprocessARGB(const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, @@ -481,6 +468,8 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb, } } +extern void SharpYuvInit(VP8CPUInfo cpu_info_func); + static int ImportYUVAFromRGBA(const uint8_t* r_ptr, const uint8_t* g_ptr, const uint8_t* b_ptr, @@ -516,7 +505,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr, } if (use_iterative_conversion) { - SafeInitSharpYuv(); + SharpYuvInit(VP8GetCPUInfo); if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) { return 0; } diff --git a/thirdparty/libwebp/src/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h index 71f76702ae..c9927c47d8 100644 --- a/thirdparty/libwebp/src/enc/vp8i_enc.h +++ b/thirdparty/libwebp/src/enc/vp8i_enc.h @@ -31,8 +31,8 @@ extern "C" { // version numbers #define ENC_MAJ_VERSION 1 -#define ENC_MIN_VERSION 2 -#define ENC_REV_VERSION 4 +#define ENC_MIN_VERSION 3 +#define ENC_REV_VERSION 0 enum { MAX_LF_LEVELS = 64, // Maximum loop filter level MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost diff --git a/thirdparty/libwebp/src/enc/vp8l_enc.c b/thirdparty/libwebp/src/enc/vp8l_enc.c index 2b345df610..0b07e529a9 100644 --- a/thirdparty/libwebp/src/enc/vp8l_enc.c +++ b/thirdparty/libwebp/src/enc/vp8l_enc.c @@ -361,10 +361,11 @@ typedef enum { kHistoTotal // Must be last. } HistoIx; -static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) { - const int green = p >> 8; // The upper bits are masked away later. - ++r[((p >> 16) - green) & 0xff]; - ++b[((p >> 0) - green) & 0xff]; +static void AddSingleSubGreen(uint32_t p, + uint32_t* const r, uint32_t* const b) { + const int green = (int)p >> 8; // The upper bits are masked away later. 
+ ++r[(((int)p >> 16) - green) & 0xff]; + ++b[(((int)p >> 0) - green) & 0xff]; } static void AddSingle(uint32_t p, @@ -1354,7 +1355,7 @@ static int EncodeImageInternal( static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height, VP8LBitWriter* const bw) { VP8LPutBits(bw, TRANSFORM_PRESENT, 1); - VP8LPutBits(bw, SUBTRACT_GREEN, 2); + VP8LPutBits(bw, SUBTRACT_GREEN_TRANSFORM, 2); VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height); } diff --git a/thirdparty/libwebp/src/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h index 0f4af1784d..7929138c44 100644 --- a/thirdparty/libwebp/src/mux/muxi.h +++ b/thirdparty/libwebp/src/mux/muxi.h @@ -28,8 +28,8 @@ extern "C" { // Defines and constants. #define MUX_MAJ_VERSION 1 -#define MUX_MIN_VERSION 2 -#define MUX_REV_VERSION 4 +#define MUX_MIN_VERSION 3 +#define MUX_REV_VERSION 0 // Chunk object. typedef struct WebPChunk WebPChunk; diff --git a/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h index 404b9a6d8c..24f3af7b54 100644 --- a/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h +++ b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h @@ -148,9 +148,9 @@ int VP8GetSigned(VP8BitReader* WEBP_RESTRICT const br, int v, const range_t value = (range_t)(br->value_ >> pos); const int32_t mask = (int32_t)(split - value) >> 31; // -1 or 0 br->bits_ -= 1; - br->range_ += mask; + br->range_ += (range_t)mask; br->range_ |= 1; - br->value_ -= (bit_t)((split + 1) & mask) << pos; + br->value_ -= (bit_t)((split + 1) & (uint32_t)mask) << pos; BT_TRACK(br); return (v ^ mask) - mask; } diff --git a/thirdparty/libwebp/src/utils/huffman_utils.c b/thirdparty/libwebp/src/utils/huffman_utils.c index 0cba0fbb7d..90c2fbf7c1 100644 --- a/thirdparty/libwebp/src/utils/huffman_utils.c +++ b/thirdparty/libwebp/src/utils/huffman_utils.c @@ -142,7 +142,7 @@ static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits, { int step; // step size to replicate values in current table - uint32_t low = -1; // low bits for current root entry + uint32_t low = 0xffffffffu; // low bits for current root entry uint32_t mask = total_size - 1; // mask for low bits uint32_t key = 0; // reversed prefix code int num_nodes = 1; // number of Huffman tree nodes diff --git a/thirdparty/libwebp/src/utils/utils.h b/thirdparty/libwebp/src/utils/utils.h index ef04f108fe..c5ee873357 100644 --- a/thirdparty/libwebp/src/utils/utils.h +++ b/thirdparty/libwebp/src/utils/utils.h @@ -64,7 +64,8 @@ WEBP_EXTERN void WebPSafeFree(void* const ptr); // Alignment #define WEBP_ALIGN_CST 31 -#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST) +#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & \ + ~(uintptr_t)WEBP_ALIGN_CST) #include <string.h> // memcpy() is the safe way of moving potentially unaligned 32b memory. @@ -73,10 +74,19 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) { memcpy(&A, ptr, sizeof(A)); return A; } + +static WEBP_INLINE int32_t WebPMemToInt32(const uint8_t* const ptr) { + return (int32_t)WebPMemToUint32(ptr); +} + static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) { memcpy(ptr, &val, sizeof(val)); } +static WEBP_INLINE void WebPInt32ToMem(uint8_t* const ptr, int val) { + WebPUint32ToMem(ptr, (uint32_t)val); +} + //------------------------------------------------------------------------------ // Reading/writing data. 
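
The WebPMemToInt32()/WebPInt32ToMem() wrappers added to utils.h above let the SSE paths exchange 32-bit values with memory without unaligned dereferences and without implicit unsigned/signed conversions, since _mm_cvtsi32_si128()/_mm_cvtsi128_si32() traffic in plain int. A minimal sketch of that pattern, using hypothetical stand-in names rather than the library's own helpers:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* Unaligned-safe 32-bit load via memcpy, returned as a signed value. */
    static inline int32_t mem_to_int32(const uint8_t* ptr) {
      uint32_t v;
      memcpy(&v, ptr, sizeof(v));   /* avoids alignment and aliasing issues */
      return (int32_t)v;
    }

    /* Feed four bytes into an SSE register without a sign/width warning. */
    static inline __m128i load_4_bytes(const uint8_t* src) {
      return _mm_cvtsi32_si128(mem_to_int32(src));
    }
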
diff --git a/thirdparty/libwebp/src/webp/format_constants.h b/thirdparty/libwebp/src/webp/format_constants.h index eca6981a47..999035c5d2 100644 --- a/thirdparty/libwebp/src/webp/format_constants.h +++ b/thirdparty/libwebp/src/webp/format_constants.h @@ -55,7 +55,7 @@ typedef enum { PREDICTOR_TRANSFORM = 0, CROSS_COLOR_TRANSFORM = 1, - SUBTRACT_GREEN = 2, + SUBTRACT_GREEN_TRANSFORM = 2, COLOR_INDEXING_TRANSFORM = 3 } VP8LImageTransformType; diff --git a/thirdparty/libwebp/src/webp/types.h b/thirdparty/libwebp/src/webp/types.h index 47f7f2b007..f255432e41 100644 --- a/thirdparty/libwebp/src/webp/types.h +++ b/thirdparty/libwebp/src/webp/types.h @@ -42,7 +42,11 @@ typedef long long int int64_t; # if defined(__GNUC__) && __GNUC__ >= 4 # define WEBP_EXTERN extern __attribute__ ((visibility ("default"))) # else -# define WEBP_EXTERN extern +# if defined(_MSC_VER) && defined(WEBP_DLL) +# define WEBP_EXTERN __declspec(dllexport) +# else +# define WEBP_EXTERN extern +# endif # endif /* __GNUC__ >= 4 */ #endif /* WEBP_EXTERN */ diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md index 3c52415f62..b673c248b2 100644 --- a/thirdparty/meshoptimizer/LICENSE.md +++ b/thirdparty/meshoptimizer/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016-2021 Arseny Kapoulkine +Copyright (c) 2016-2022 Arseny Kapoulkine Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp index b1f7b359c1..c4672ad606 100644 --- a/thirdparty/meshoptimizer/clusterizer.cpp +++ b/thirdparty/meshoptimizer/clusterizer.cpp @@ -283,6 +283,79 @@ static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int return result; } +static unsigned int getNeighborTriangle(const meshopt_Meshlet& meshlet, const Cone* meshlet_cone, unsigned int* meshlet_vertices, const unsigned int* indices, const TriangleAdjacency2& adjacency, const Cone* triangles, const unsigned int* live_triangles, const unsigned char* used, float meshlet_expected_radius, float cone_weight, unsigned int* out_extra) +{ + unsigned int best_triangle = ~0u; + unsigned int best_extra = 5; + float best_score = FLT_MAX; + + for (size_t i = 0; i < meshlet.vertex_count; ++i) + { + unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; + + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; + + for (size_t j = 0; j < neighbors_size; ++j) + { + unsigned int triangle = neighbors[j]; + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + + unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); + + // triangles that don't add new vertices to meshlets are max. 
priority + if (extra != 0) + { + // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets + if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) + extra = 0; + + extra++; + } + + // since topology-based priority is always more important than the score, we can skip scoring in some cases + if (extra > best_extra) + continue; + + float score = 0; + + // caller selects one of two scoring functions: geometrical (based on meshlet cone) or topological (based on remaining triangles) + if (meshlet_cone) + { + const Cone& tri_cone = triangles[triangle]; + + float distance2 = + (tri_cone.px - meshlet_cone->px) * (tri_cone.px - meshlet_cone->px) + + (tri_cone.py - meshlet_cone->py) * (tri_cone.py - meshlet_cone->py) + + (tri_cone.pz - meshlet_cone->pz) * (tri_cone.pz - meshlet_cone->pz); + + float spread = tri_cone.nx * meshlet_cone->nx + tri_cone.ny * meshlet_cone->ny + tri_cone.nz * meshlet_cone->nz; + + score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); + } + else + { + // each live_triangles entry is >= 1 since it includes the current triangle we're processing + score = float(live_triangles[a] + live_triangles[b] + live_triangles[c] - 3); + } + + // note that topology-based priority is always more important than the score + // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost + if (extra < best_extra || score < best_score) + { + best_triangle = triangle; + best_extra = extra; + best_score = score; + } + } + } + + if (out_extra) + *out_extra = best_extra; + + return best_triangle; +} + struct KDNode { union @@ -464,13 +537,15 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices); assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles); assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned + assert(cone_weight >= 0 && cone_weight <= 1); + meshopt_Allocator allocator; TriangleAdjacency2 adjacency = {}; @@ -511,65 +586,18 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve for (;;) { - unsigned int best_triangle = ~0u; - unsigned int best_extra = 5; - float best_score = FLT_MAX; - Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count); - for (size_t i = 0; i < meshlet.vertex_count; ++i) - { - unsigned int index = meshlet_vertices[meshlet.vertex_offset + i]; - - unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbours_size = adjacency.counts[index]; - - for (size_t j = 0; j < neighbours_size; ++j) - { - unsigned int triangle = neighbours[j]; - assert(!emitted_flags[triangle]); - - unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; - assert(a < vertex_count && b < vertex_count && c < vertex_count); - - unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff); - - // triangles that don't add new vertices to meshlets are max. 
priority - if (extra != 0) - { - // artificially increase the priority of dangling triangles as they're expensive to add to new meshlets - if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1) - extra = 0; - - extra++; - } - - // since topology-based priority is always more important than the score, we can skip scoring in some cases - if (extra > best_extra) - continue; - - const Cone& tri_cone = triangles[triangle]; - - float distance2 = - (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) + - (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) + - (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz); - - float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz; + unsigned int best_extra = 0; + unsigned int best_triangle = getNeighborTriangle(meshlet, &meshlet_cone, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, cone_weight, &best_extra); - float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius); - - // note that topology-based priority is always more important than the score - // this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost - if (extra < best_extra || score < best_score) - { - best_triangle = triangle; - best_extra = extra; - best_score = score; - } - } + // if the best triangle doesn't fit into current meshlet, the spatial scoring we've used is not very meaningful, so we re-select using topological scoring + if (best_triangle != ~0u && (meshlet.vertex_count + best_extra > max_vertices || meshlet.triangle_count >= max_triangles)) + { + best_triangle = getNeighborTriangle(meshlet, NULL, meshlet_vertices, indices, adjacency, triangles, live_triangles, used, meshlet_expected_radius, 0.f, NULL); } + // when we run out of neighboring triangles we need to switch to spatial search; we currently just pick the closest triangle irrespective of connectivity if (best_triangle == ~0u) { float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz}; @@ -604,16 +632,16 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve { unsigned int index = indices[best_triangle * 3 + k]; - unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; - size_t neighbours_size = adjacency.counts[index]; + unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbors_size = adjacency.counts[index]; - for (size_t i = 0; i < neighbours_size; ++i) + for (size_t i = 0; i < neighbors_size; ++i) { - unsigned int tri = neighbours[i]; + unsigned int tri = neighbors[i]; if (tri == best_triangle) { - neighbours[i] = neighbours[neighbours_size - 1]; + neighbors[i] = neighbors[neighbors_size - 1]; adjacency.counts[index]--; break; } @@ -687,7 +715,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t assert(index_count % 3 == 0); assert(index_count / 3 <= kMeshletMaxTriangles); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); (void)vertex_count; @@ -839,7 +867,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices using namespace meshopt; assert(triangle_count <= kMeshletMaxTriangles); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && 
vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); unsigned int indices[kMeshletMaxTriangles * 3]; diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp index f60db0dc4f..cad808a2b1 100644 --- a/thirdparty/meshoptimizer/indexgenerator.cpp +++ b/thirdparty/meshoptimizer/indexgenerator.cpp @@ -187,7 +187,7 @@ size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int using namespace meshopt; assert(indices || index_count == vertex_count); - assert(index_count % 3 == 0); + assert(!indices || index_count % 3 == 0); assert(vertex_size > 0 && vertex_size <= 256); meshopt_Allocator allocator; @@ -412,7 +412,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; @@ -483,7 +483,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h index 463fad29da..46d28d3ea3 100644 --- a/thirdparty/meshoptimizer/meshoptimizer.h +++ b/thirdparty/meshoptimizer/meshoptimizer.h @@ -1,7 +1,7 @@ /** - * meshoptimizer - version 0.17 + * meshoptimizer - version 0.18 * - * Copyright (C) 2016-2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Copyright (C) 2016-2022, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) * Report bugs and download new versions at https://github.com/zeux/meshoptimizer * * This library is distributed under the MIT License. See notice at the end of this file. @@ -12,7 +12,7 @@ #include <stddef.h> /* Version macro; major * 1000 + minor * 10 + patch */ -#define MESHOPTIMIZER_VERSION 170 /* 0.17 */ +#define MESHOPTIMIZER_VERSION 180 /* 0.18 */ /* If no API is defined, assume default */ #ifndef MESHOPTIMIZER_API @@ -37,8 +37,8 @@ extern "C" { #endif /** - * Vertex attribute stream, similar to glVertexPointer - * Each element takes size bytes, with stride controlling the spacing between successive elements. + * Vertex attribute stream + * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size). */ struct meshopt_Stream { @@ -115,7 +115,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering. 
* * destination must contain enough space for the resulting index buffer (index_count*2 elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -131,7 +131,7 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details. * * destination must contain enough space for the resulting index buffer (index_count*4 elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -171,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination * * destination must contain enough space for the resulting index buffer (index_count elements) * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently */ MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold); @@ -313,7 +313,21 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterQuat(void* destination, size MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_t count, size_t stride, int bits, const float* data); /** - * Experimental: Mesh simplifier + * Simplification options + */ +enum +{ + /* Do not move vertices that are located on the topological border (vertices on triangle edges that don't have a paired triangle). Useful for simplifying portions of the larger mesh. */ + meshopt_SimplifyLockBorder = 1 << 0, +}; + +/** + * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); + +/** + * Mesh simplifier * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. 
* If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification. @@ -322,16 +336,12 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation; value range [0..1] + * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); - -/** - * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex) - */ -MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count); +MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error); /** * Experimental: Mesh simplifier (sloppy) @@ -342,8 +352,8 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* d * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)! - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer - * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation + * vertex_positions should have float3 position in the first 12 bytes of each vertex + * target_error represents the error relative to mesh extents that can be tolerated, e.g. 
0.01 = 1% deformation; value range [0..1] * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error); @@ -356,17 +366,17 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. * * destination must contain enough space for the target index buffer (target_vertex_count elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count); /** - * Experimental: Returns the error scaling factor used by the simplifier to convert between absolute and relative extents + * Returns the error scaling factor used by the simplifier to convert between absolute and relative extents * * Absolute error must be *divided* by the scaling factor before passing it to meshopt_simplify as target_error * Relative error returned by meshopt_simplify via result_error must be *multiplied* by the scaling factor to get absolute error. */ -MESHOPTIMIZER_EXPERIMENTAL float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_API float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); /** * Mesh stripifier @@ -418,7 +428,7 @@ struct meshopt_OverdrawStatistics * Returns overdraw statistics using a software rasterizer * Results may not match actual GPU performance * - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -456,7 +466,7 @@ struct meshopt_Meshlet * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3 - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512) * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency */ @@ -498,7 +508,7 @@ struct meshopt_Bounds * The formula that uses the apex is slightly more accurate but needs the apex; if 
you are already using bounding sphere * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable. * - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size) */ MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -518,7 +528,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destinati * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. * * destination must contain enough space for the resulting index buffer (index_count elements) - * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * vertex_positions should have float3 position in the first 12 bytes of each vertex */ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); @@ -610,7 +620,7 @@ inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_s template <typename T> inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size); template <typename T> -inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0); +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = 0); template <typename T> inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0); template <typename T> @@ -945,12 +955,12 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const } template <typename T> -inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error) +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error) { meshopt_IndexAdapter<T> in(0, indices, index_count); meshopt_IndexAdapter<T> out(destination, 0, index_count); - return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, result_error); + return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, 
options, result_error); } template <typename T> @@ -1039,7 +1049,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_ #endif /** - * Copyright (c) 2016-2021 Arseny Kapoulkine + * Copyright (c) 2016-2022 Arseny Kapoulkine * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp index 8d5859ba39..8b6f254134 100644 --- a/thirdparty/meshoptimizer/overdrawanalyzer.cpp +++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp @@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp index 143656ed76..cc22dbcffc 100644 --- a/thirdparty/meshoptimizer/overdrawoptimizer.cpp +++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp @@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind using namespace meshopt; assert(index_count % 3 == 0); - assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256); assert(vertex_positions_stride % sizeof(float) == 0); meshopt_Allocator allocator; diff --git a/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch b/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch index 21daac6eec..5cac985dc5 100644 --- a/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch +++ b/thirdparty/meshoptimizer/patches/attribute-aware-simplify-distance-only-metric.patch @@ -1,5 +1,5 @@ diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp -index 5e92e2dc73..e40c141e76 100644 +index d8d4a67391..3847afc736 100644 --- a/thirdparty/meshoptimizer/simplifier.cpp +++ b/thirdparty/meshoptimizer/simplifier.cpp @@ -20,7 +20,7 @@ @@ -11,7 +11,7 @@ index 5e92e2dc73..e40c141e76 100644 // This work is based on: // Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 
1997 -@@ -453,6 +453,7 @@ struct Collapse +@@ -458,6 +458,7 @@ struct Collapse float error; unsigned int errorui; }; @@ -19,7 +19,7 @@ index 5e92e2dc73..e40c141e76 100644 }; static float normalize(Vector3& v) -@@ -533,6 +534,34 @@ static float quadricError(const Quadric& Q, const Vector3& v) +@@ -538,6 +539,34 @@ static float quadricError(const Quadric& Q, const Vector3& v) return fabsf(r) * s; } @@ -54,7 +54,7 @@ index 5e92e2dc73..e40c141e76 100644 static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) { float aw = a * w; -@@ -688,7 +717,7 @@ static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3 +@@ -693,7 +722,7 @@ static void quadricUpdateAttributes(Quadric& Q, const Vector3& p0, const Vector3 } #endif @@ -63,7 +63,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < index_count; i += 3) { -@@ -698,6 +727,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -703,6 +732,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic Quadric Q; quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f); @@ -73,7 +73,7 @@ index 5e92e2dc73..e40c141e76 100644 #if ATTRIBUTES quadricUpdateAttributes(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], Q.w); -@@ -708,7 +740,7 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -713,7 +745,7 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic } } @@ -82,7 +82,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < index_count; i += 3) { -@@ -752,6 +784,9 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic +@@ -757,6 +789,9 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic quadricAdd(vertex_quadrics[remap[i0]], Q); quadricAdd(vertex_quadrics[remap[i1]], Q); @@ -92,7 +92,7 @@ index 5e92e2dc73..e40c141e76 100644 } } } -@@ -856,7 +891,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices +@@ -861,7 +896,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices return collapse_count; } @@ -101,7 +101,7 @@ index 5e92e2dc73..e40c141e76 100644 { for (size_t i = 0; i < collapse_count; ++i) { -@@ -876,10 +911,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const +@@ -881,10 +916,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const float ei = quadricError(qi, vertex_positions[i1]); float ej = quadricError(qj, vertex_positions[j1]); @@ -116,7 +116,7 @@ index 5e92e2dc73..e40c141e76 100644 } } -@@ -976,7 +1015,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse +@@ -981,7 +1020,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse } } @@ -125,7 +125,7 @@ index 5e92e2dc73..e40c141e76 100644 { size_t edge_collapses = 0; size_t triangle_collapses = 0; -@@ -1038,6 +1077,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* +@@ -1043,6 +1082,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* assert(collapse_remap[r1] == r1); quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); @@ -133,7 +133,7 @@ index 5e92e2dc73..e40c141e76 100644 if (vertex_kind[i0] == Kind_Complex) { -@@ -1075,7 +1115,7 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* +@@ -1080,7 +1120,7 @@ 
static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
edge_collapses++;
@@ -142,7 +142,7 @@ index 5e92e2dc73..e40c141e76 100644
}
#if TRACE
-@@ -1463,9 +1503,11 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1469,9 +1509,11 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
@@ -156,7 +156,7 @@ index 5e92e2dc73..e40c141e76 100644
if (result != indices)
memcpy(result, indices, index_count * sizeof(unsigned int));
-@@ -1496,7 +1538,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1502,7 +1544,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
if (edge_collapse_count == 0)
break;
@@ -165,7 +165,7 @@ index 5e92e2dc73..e40c141e76 100644
#if TRACE > 1
dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
-@@ -1515,7 +1557,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
+@@ -1521,7 +1563,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
printf("pass %d: ", int(pass_count++));
#endif
diff --git a/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
index 33a17fe9fa..c065026a7d 100644
--- a/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
+++ b/thirdparty/meshoptimizer/patches/attribute-aware-simplify.patch
@@ -1,21 +1,21 @@
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
-index be4b765d97..463fad29da 100644
+index d95725dd71..46d28d3ea3 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
-@@ -328,6 +328,11 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeFilterExp(void* destination, size_
- */
- MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error);
+@@ -321,6 +321,11 @@ enum
+ meshopt_SimplifyLockBorder = 1 << 0,
+ };
+/**
+ * Experimental: Mesh simplifier with attribute metric; attributes follow xyz position data atm (vertex data must contain 3 + attribute_count floats per vertex)
+ */
-+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
++MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error, const float* attributes, const float* attribute_weights, size_t attribute_count);
+ /**
- * Experimental: Mesh simplifier (sloppy)
- * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
-index a74b08a97d..5e92e2dc73 100644
+index 5f0e9bac31..797329b010 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -20,6 +20,8 @@
@@ -27,7 +27,7 @@ index a74b08a97d..5e92e2dc73 100644
// This work is based on:
// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
// Michael Garland. Quadric-based polygonal surface simplification. 1999
-@@ -371,6 +373,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
+@@ -376,6 +378,10 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
struct Vector3
{
float x, y, z;
@@ -38,7 +38,7 @@ index a74b08a97d..5e92e2dc73 100644
};
static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
-@@ -427,6 +433,13 @@ struct Quadric
+@@ -432,6 +438,13 @@ struct Quadric
float a10, a20, a21;
float b0, b1, b2, c;
float w;
@@ -52,7 +52,7 @@ index a74b08a97d..5e92e2dc73 100644
};
struct Collapse
-@@ -469,6 +482,16 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
Q.b2 += R.b2;
Q.c += R.c;
Q.w += R.w;
@@ -69,7 +69,7 @@ index a74b08a97d..5e92e2dc73 100644
}
static float quadricError(const Quadric& Q, const Vector3& v)
-@@ -494,6 +517,17 @@ static float quadricError(const Quadric& Q, const Vector3& v)
r += ry * v.y;
r += rz * v.z;
@@ -87,7 +87,7 @@ index a74b08a97d..5e92e2dc73 100644
float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
return fabsf(r) * s;
-@@ -517,6 +551,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
+@@ -522,6 +556,13 @@ static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, flo
Q.b2 = c * dw;
Q.c = d * dw;
Q.w = w;
@@ -101,7 +101,7 @@ index a74b08a97d..5e92e2dc73 100644
}
static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
-@@ -569,6 +610,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
+@@ -574,6 +615,84 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
}
@@ -186,7 +186,7 @@ index a74b08a97d..5e92e2dc73 100644
static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
{
for (size_t i = 0; i < index_count; i += 3)
-@@ -580,6 +699,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
+@@ -585,6 +704,9 @@ static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
Quadric Q;
quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
@@ -196,29 +196,30 @@ index a74b08a97d..5e92e2dc73 100644
quadricAdd(vertex_quadrics[remap[i0]], Q);
quadricAdd(vertex_quadrics[remap[i1]], Q);
quadricAdd(vertex_quadrics[remap[i2]], Q);
-@@ -1273,13 +1395,19 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
+@@ -1278,14 +1400,20 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
#endif
- size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+ return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, options, out_result_error, 0, 0, 0);
+}
+
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
{
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
- assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(vertex_stride >= 12 && vertex_stride <= 256);
+ assert(vertex_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
+ assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
+ assert(attribute_count <= ATTRIBUTES);
meshopt_Allocator allocator;
-@@ -1293,7 +1421,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
+@@ -1299,7 +1427,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
// build position remap that maps each vertex to the one with identical position
unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
@@ -227,7 +228,7 @@ index a74b08a97d..5e92e2dc73 100644
// classify vertices; vertex kind determines collapse rules, see kCanCollapse
unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
-@@ -1317,7 +1445,21 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
#endif
Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+
Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
-@@ -1409,7 +1551,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
// result_error is quadratic; we need to remap it back to linear
if (out_result_error)
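The hunks above change the public simplifier API: meshopt_simplify() and the Godot-added meshopt_simplifyWithAttributes() both gain an unsigned int options bitmask, and meshopt_SimplifyLockBorder (1 << 0) is the only flag currently accepted. A minimal caller-side sketch of the updated signature follows; it is illustrative only, and the buffer names, target count and error threshold are made up:

    #include <vector>
    #include "meshoptimizer.h"

    // Build a lower-detail index buffer while keeping border vertices pinned, which
    // helps when neighbouring mesh chunks are simplified independently.
    std::vector<unsigned int> build_lod(const std::vector<unsigned int>& indices,
                                        const std::vector<float>& positions, // xyz per vertex
                                        size_t target_index_count)
    {
        std::vector<unsigned int> lod(indices.size());
        float result_error = 0.f;

        size_t lod_size = meshopt_simplify(
                lod.data(), indices.data(), indices.size(),
                positions.data(), positions.size() / 3, 3 * sizeof(float),
                target_index_count, /* target_error */ 0.01f,
                meshopt_SimplifyLockBorder, &result_error);

        lod.resize(lod_size);
        return lod;
    }

Passing 0 for options keeps the previous behaviour.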
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
index e40c141e76..391a77861a 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -254,7 +254,7 @@ static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int
return false;
}
-static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge, unsigned int options)
{
memset(loop, -1, vertex_count * sizeof(unsigned int));
memset(loopback, -1, vertex_count * sizeof(unsigned int));
@@ -364,6 +364,11 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
}
}
+ if (options & meshopt_SimplifyLockBorder)
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (result[i] == Kind_Border)
+ result[i] = Kind_Locked;
+
#if TRACE
printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n", int(stats[0]), int(stats[1]), int(stats[2]), int(stats[3]));
@@ -1434,19 +1439,20 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = 0;
MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
#endif
-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
{
- return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, out_result_error, 0, 0, 0);
+ return meshopt_simplifyWithAttributes(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, target_index_count, target_error, options, out_result_error, 0, 0, 0);
}
-size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_data, size_t vertex_count, size_t vertex_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error, const float* attributes, const float* attribute_weights, size_t attribute_count)
{
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_stride > 0 && vertex_stride <= 256);
+ assert(vertex_stride >= 12 && vertex_stride <= 256);
assert(vertex_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
+ assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
assert(attribute_count <= ATTRIBUTES);
meshopt_Allocator allocator;
@@ -1467,7 +1473,7 @@ size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned
unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
- classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+ classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge, options);
#if TRACE
size_t unique_positions = 0;
@@ -1605,7 +1611,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_index_count <= index_count);
@@ -1736,7 +1742,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
assert(target_vertex_count <= vertex_count);
@@ -1848,7 +1854,7 @@ float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count,
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);
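Besides threading options through to classifyVertices(), the simplifier hunks above (and the spatialorder.cpp ones below) tighten the stride assertions from "> 0" to ">= 12" bytes, i.e. at least three floats of position per vertex. The usual interleaved-layout call pattern is unaffected; a small sketch, with a hypothetical Vertex struct:

    #include "meshoptimizer.h"

    struct Vertex {
        float px, py, pz; // the 12 bytes of position data the new asserts require
        float nx, ny, nz; // further attributes are skipped via the stride
        float u, v;
    };

    // Scale factor used to convert the simplifier's relative error into mesh units.
    float lod_error_scale(const Vertex* vertices, size_t vertex_count)
    {
        // Point at the first position; the stride covers the whole interleaved vertex.
        return meshopt_simplifyScale(&vertices[0].px, vertex_count, sizeof(Vertex));
    }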
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
index b09f80ac6f..7b1a069450 100644
--- a/thirdparty/meshoptimizer/spatialorder.cpp
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
{
using namespace meshopt;
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
using namespace meshopt;
assert(index_count % 3 == 0);
- assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
assert(vertex_positions_stride % sizeof(float) == 0);
(void)vertex_count;
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
index fb8ade4b77..ce8fd3a887 100644
--- a/thirdparty/meshoptimizer/vcacheoptimizer.cpp
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -110,7 +110,7 @@ static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned
return ~0u;
}
-static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+static unsigned int getNextVertexNeighbor(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
unsigned int best_candidate = ~0u;
int best_priority = -1;
@@ -281,16 +281,16 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
{
unsigned int index = indices[current_triangle * 3 + k];
- unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
- size_t neighbours_size = adjacency.counts[index];
+ unsigned int* neighbors = &adjacency.data[0] + adjacency.offsets[index];
+ size_t neighbors_size = adjacency.counts[index];
- for (size_t i = 0; i < neighbours_size; ++i)
+ for (size_t i = 0; i < neighbors_size; ++i)
{
- unsigned int tri = neighbours[i];
+ unsigned int tri = neighbors[i];
if (tri == current_triangle)
{
- neighbours[i] = neighbours[neighbours_size - 1];
+ neighbors[i] = neighbors[neighbors_size - 1];
adjacency.counts[index]--;
break;
}
@@ -314,10 +314,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
vertex_scores[index] = score;
// update scores of vertex triangles
- const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
- const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+ const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[index];
+ const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[index];
- for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int tri = *it;
assert(!emitted_flags[tri]);
@@ -412,11 +412,11 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
{
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
- // emit all vertex neighbours
- const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
- const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+ // emit all vertex neighbors
+ const unsigned int* neighbors_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+ const unsigned int* neighbors_end = neighbors_begin + adjacency.counts[current_vertex];
- for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ for (const unsigned int* it = neighbors_begin; it != neighbors_end; ++it)
{
unsigned int triangle = *it;
@@ -461,7 +461,7 @@ void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned i
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
// get next vertex
- current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+ current_vertex = getNextVertexNeighbor(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
if (current_vertex == ~0u)
{
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 7925ea862c..4bd11121d2 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -50,6 +50,12 @@
#define SIMD_TARGET
#endif
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
#endif // !MESHOPTIMIZER_NO_SIMD
#ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
typedef int unaligned_int;
#endif
+#ifdef SIMD_LATENCYOPT
+ unsigned int data32;
+ memcpy(&data32, data, 4);
+ data32 &= data32 >> 1;
+
+ // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+ unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+#ifdef SIMD_LATENCYOPT
+ return data + 4 + datacnt;
+#else
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 2:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned long long data64;
+ memcpy(&data64, data, 8);
+ data64 &= data64 >> 1;
+ data64 &= data64 >> 2;
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+#ifdef SIMD_LATENCYOPT
+ return data + 8 + datacnt;
+#else
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 3:
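The SIMD_LATENCYOPT blocks above compute the size of the current group arithmetically instead of waiting on the two kDecodeBytesGroupCount table lookups, so the address of the next 16-byte group is available sooner. A standalone scalar sketch of the 2-bit case, under the same assumption the comment states (an all-sentinel group would have been encoded as mode 3, so the count is at most 15); the helper name is made up:

    #include <cstring>

    static int count_sentinel_codes(const unsigned char* data)
    {
        unsigned int data32;
        memcpy(&data32, data, 4);

        // bit 2*i of data32 becomes 1 exactly when the i-th two-bit code is 3
        data32 &= data32 >> 1;

        // spread the 16 flags so each one sits in the low bit of its own nibble
        unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

        // horizontal add of all nibbles via multiply; the sum lands in the top nibble
        return int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
    }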
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
- static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
-
- uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
- uint8x16_t masked = vandq_u8(mask, byte_mask);
+ // magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+ const uint64_t magic = 0x000103070f1f3f80ull;
-#ifdef __aarch64__
- // aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
- mask0 = vaddv_u8(vget_low_u8(masked));
- mask1 = vaddv_u8(vget_high_u8(masked));
-#else
- // we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
- uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
- uint8x8_t sum2 = vpadd_u8(sum1, sum1);
- uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+ uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
- mask0 = vget_lane_u8(sum3, 0);
- mask1 = vget_lane_u8(sum3, 1);
-#endif
+ mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+ mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
}
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
case 1:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned int data32;
+ memcpy(&data32, data, 4);
+ data32 &= data32 >> 1;
+
+ // arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+ unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
uint8x8_t sel2 = vld1_u8(data);
uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
vst1q_u8(buffer, result);
+#ifdef SIMD_LATENCYOPT
+ return data + 4 + datacnt;
+#else
return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 2:
{
+#ifdef SIMD_LATENCYOPT
+ unsigned long long data64;
+ memcpy(&data64, data, 8);
+ data64 &= data64 >> 1;
+ data64 &= data64 >> 2;
+
+ // adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+ int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
uint8x8_t sel4 = vld1_u8(data);
uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
vst1q_u8(buffer, result);
+#ifdef SIMD_LATENCYOPT
+ return data + 8 + datacnt;
+#else
return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
}
case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
const uint64_t magic = 0x000103070f1f3f80ull;
- // TODO: This can use v8x16_bitmask in the future
mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
}
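neonMoveMask and wasmMoveMask now share the same reduction: for a 64-bit lane whose bytes are each 0x00 or 0xff, a single multiply by the z3-derived constant gathers the eight per-byte flags into the top byte of the product. A scalar sketch of the idea, with a made-up helper name:

    #include <cstdint>

    static uint8_t movemask8(uint64_t lane) // every byte of lane must be 0x00 or 0xff
    {
        const uint64_t magic = 0x000103070f1f3f80ull;
        return uint8_t((lane * magic) >> 56); // bit i is set iff byte i of lane was 0xff
    }

    // For example, movemask8(0x00000000000000ffull) == 0x01 and movemask8(0xff00000000000000ull) == 0x80.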
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
index 606a280aa9..14a73b1ddd 100644
--- a/thirdparty/meshoptimizer/vertexfilter.cpp
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -931,7 +931,7 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
const float* v = &data[i * stride_float];
unsigned int* d = &destination[i * stride_float];
- // use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
+ // use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
int exp = -100;
for (size_t j = 0; j < stride_float; ++j)