292 files changed, 39863 insertions, 6382 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index 1573fab7b6..62690e21c7 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -62,8 +62,8 @@ Use UI font if exists, because it has tight vertical metrics and good for UI.
 ### Hack Regular
 
 - Upstream: https://github.com/source-foundry/Hack
-- Version: 2.020
-- License: Hack Open Font License v2.0
+- Version: 3.000
+- License: MIT + Bitstream Vera License
 
 ### DroidSans*.ttf
 
@@ -179,11 +179,14 @@ Files extracted from upstream source:
 
 TODO.
 
+Important: File `libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c` has a
+Godot-made change marked with `// -- GODOT --` comments.
+
 
 ## libwebp
 
 - Upstream: https://chromium.googlesource.com/webm/libwebp/
-- Version: 0.6.0
+- Version: 0.6.1
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
@@ -266,6 +269,19 @@ Collection of single-file libraries used in Godot components.
   * Version: ?
   * License: BSD
 
+### poshlib
+
+- Upstream: http://poshlib.hookatooka.com/poshlib/trac.cgi (username guest, password guest123)
+- Version: 1.3.002
+- License: MIT
+
+Files extracted from the upstream source:
+
+- posh.c
+- posh.h
+
+(no license file was included with the upstream distribution)
+
 ### scene
 
 - `mikktspace.{c,h}`
@@ -281,6 +297,20 @@ Collection of single-file libraries used in Godot components.
   * Version: 1.11
   * License: Public Domain (Unlicense) or MIT
 
+## thekla_atlas
+
+- Upstream: https://github.com/Thekla/thekla_atlas
+- Version: 80a1430 (git)
+- License: zlib
+
+Files extracted from the upstream source:
+
+- Relevant sources from src/
+- License.txt
+
+Important: Some files have Godot-made changes; those
+changes are marked with `// -- GODOT --` comments.
+
 
 ## nanosvg
 
@@ -335,6 +365,7 @@ Files extracted from upstream source:
 - celt/ and silk/ subfolders
 - COPYING
 
+
 ## pcre2
 
 - Upstream: http://www.pcre.org/
@@ -348,6 +379,7 @@ Files extracted from upstream source:
 - src/pcre2_jit_*.c and src/sljit/*
 - AUTHORS and COPYING
 
+
 ## pvrtccompressor
 
 - Upstream: https://bitbucket.org/jthlim/pvrtccompressor
@@ -359,12 +391,14 @@ Files extracted from upstream source:
 - all .cpp and .h files apart from `main.cpp`
 - LICENSE.TXT
 
+
 ## recastnavigation
 
 - Upstream: https://github.com/recastnavigation/recastnavigation
 - version: git commit ef3ea40f - 2016-02-06
 - License: zlib
 
+
 ## rtaudio
 
 - Upstream: http://www.music.mcgill.ca/~gary/rtaudio/
@@ -386,21 +420,21 @@ Files extracted from upstream source:
 
 - all .cpp, .h and .inl files
 
+Important: Some files have Godot-made changes.
+They are marked with `// -- GODOT start --` and `// -- GODOT end --`
+comments and a patch is provided in the squish/ folder.
+
 
 ## tinyexr
 
 - Upstream: https://github.com/syoyo/tinyexr
-- Version: 0.9.5+ (git a145d69)
+- Version: 0.9.5+ (git 9f784ca - 24 October 2017)
 - License: BSD-3-Clause
 
 Files extracted from upstream source:
 
 - `tinyexr.{cc,h}`
 
-Important: Some changes were made to get TinyEXR to build on the ancient
-MinGW-w64 toolchain of Travis CI.
-https://github.com/godotengine/godot/commit/37f5e1dcd94611dd5b670f013abf0323e8b47def
-
 
 ## zlib
 
diff --git a/thirdparty/fonts/Hack_Regular.ttf b/thirdparty/fonts/Hack_Regular.ttf
index a35ea2e4f4..f342700811 100644
--- a/thirdparty/fonts/Hack_Regular.ttf
+++ b/thirdparty/fonts/Hack_Regular.ttf
diff --git a/thirdparty/fonts/LICENSE_Hack.md b/thirdparty/fonts/LICENSE_Hack.md
index e9fc8a1f87..ddd23a2b81 100644
--- a/thirdparty/fonts/LICENSE_Hack.md
+++ b/thirdparty/fonts/LICENSE_Hack.md
@@ -1,49 +1,30 @@
-## License
+The work in the Hack project is Copyright 2017 Source Foundry Authors and licensed under the MIT License
 
-Hack Copyright 2015, Christopher Simpkins with Reserved Font Name "Hack".
+The work in the DejaVu project was committed to the public domain.
 
 Bitstream Vera Sans Mono Copyright 2003 Bitstream Inc. and licensed under the Bitstream Vera License with Reserved Font Names "Bitstream" and "Vera"
 
-DejaVu modifications of the original Bitstream Vera Sans Mono typeface have been committed to the public domain.
+### MIT License
 
+Copyright (c) 2017 Source Foundry Authors
 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-This Font Software is licensed under the Hack Open Font License v2.0 and the Bitstream Vera License.
-
-These licenses are copied below.
-
-
-### Hack Open Font License v2.0
-
-(Version 1.0 - 06 September 2015)
-
-(Version 2.0 - 27 September 2015)
-
-Copyright 2015 by Christopher Simpkins. All Rights Reserved.
-
-DEFINITIONS
-
-"Author" refers to any designer, engineer, programmer, technical writer or other person who contributed to the Font Software.
-
-PERMISSION AND CONDITIONS
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of the fonts accompanying this license ("Fonts") and associated source code, documentation, and binary files (the "Font Software"), to reproduce and distribute the modifications to the Bitstream Vera Font Software, including without limitation the rights to use, study, copy, merge, embed, modify, redistribute, and/or sell modified or unmodified copies of the Font Software, and to permit persons to whom the Font Software is furnished to do so, subject to the following conditions:
-
-(1) The above copyright notice and this permission notice shall be included in all modified and unmodified copies of the Font Software typefaces. These notices can be included either as stand-alone text files, human-readable headers or in the appropriate machine-readable metadata fields within text or binary files as long as those fields can be easily viewed by the user.
-
-(2) The Font Software may be modified, altered, or added to, and in particular the designs of glyphs or characters in the Fonts may be modified and additional glyphs or characters may be added to the Fonts, only if the fonts are renamed to names not containing the word "Hack".
-
-(3) Neither the Font Software nor any of its individual components, in original or modified versions, may be sold by itself.
-
-TERMINATION
-
-This license becomes null and void if any of the above conditions are not met.
-
-THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE.
-
-Except as contained in this notice, the names of Christopher Simpkins and the Author(s) of the Font Software shall not be used to promote, endorse or advertise any modified version, except to acknowledge the contribution(s) of Christopher Simpkins and the Author(s) or with their explicit written permission.  For further information, contact: chris at sourcefoundry dot org.
-
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
 
 ### BITSTREAM VERA LICENSE
 
diff --git a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index b718678537..d8a92354c9 100644
--- a/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/thirdparty/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -40,11 +40,12 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
 };
 
 #if defined(__clang__)
+// -- GODOT start --
 # if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
-    (defined(__APPLE__) && \
+    (!defined(__MACPORTS__) && defined(__APPLE__) && \
         ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
             (__clang_major__ == 5 && __clang_minor__ == 0)))
-
+// -- GODOT end --
 #  define MM256_BROADCASTSI128_SI256(x) \
        _mm_broadcastsi128_si256((__m128i const *)&(x))
 # else  // clang > 3.3, and not 5.0 on macosx.
diff --git a/thirdparty/libwebp/dsp/argb.c b/thirdparty/libwebp/dsp/argb.c
deleted file mode 100644
index cc1f9a96c3..0000000000
--- a/thirdparty/libwebp/dsp/argb.c
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions.
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int i;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-  }
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int i, offset = 0;
-  for (i = 0; i < len; ++i) {
-    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
-    offset += step;
-  }
-}
-
-void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                    const uint8_t*, int, uint32_t*);
-void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
-                   int, int, uint32_t*);
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-extern void VP8EncDspARGBInitSSE2(void);
-
-static volatile VP8CPUInfo argb_last_cpuinfo_used =
-    (VP8CPUInfo)&argb_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
-  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-
-  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8EncDspARGBInitSSE2();
-    }
-#endif
-#if defined(WEBP_USE_MIPS_DSP_R2)
-    if (VP8GetCPUInfo(kMIPSdspR2)) {
-      VP8EncDspARGBInitMIPSdspR2();
-    }
-#endif
-  }
-  argb_last_cpuinfo_used = VP8GetCPUInfo;
-}
diff --git a/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c b/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c
deleted file mode 100644
index af65acb8ff..0000000000
--- a/thirdparty/libwebp/dsp/argb_mips_dsp_r2.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (mips version).
-//
-// Author: Djordje Pesut (djordje.pesut@imgtec.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_MIPS_DSP_R2)
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  int temp0, temp1, temp2, temp3, offset;
-  const int rest = len & 1;
-  const uint32_t* const loop_end = out + len - rest;
-  const int step = 4;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[a])         \n\t"
-    "lbux         %[temp1],    %[offset](%[r])         \n\t"
-    "lbux         %[temp2],    %[offset](%[g])         \n\t"
-    "lbux         %[temp3],    %[offset](%[b])         \n\t"
-    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
-    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                    int len, int step, uint32_t* out) {
-  int temp0, temp1, temp2, offset;
-  const int rest = len & 1;
-  const int a = 0xff;
-  const uint32_t* const loop_end = out + len - rest;
-  __asm__ volatile (
-    "xor          %[offset],   %[offset], %[offset]    \n\t"
-    "beq          %[loop_end], %[out],    0f           \n\t"
-  "2:                                                  \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "addiu        %[out],      %[out],    4            \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    -4(%[out])              \n\t"
-    "addu         %[offset],   %[offset], %[step]      \n\t"
-    "bne          %[loop_end], %[out],    2b           \n\t"
-  "0:                                                  \n\t"
-    "beq          %[rest],     $zero,     1f           \n\t"
-    "lbux         %[temp0],    %[offset](%[r])         \n\t"
-    "lbux         %[temp1],    %[offset](%[g])         \n\t"
-    "lbux         %[temp2],    %[offset](%[b])         \n\t"
-    "ins          %[temp0],    %[a],      16,     16   \n\t"
-    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
-    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
-    "sw           %[temp0],    0(%[out])               \n\t"
-  "1:                                                  \n\t"
-    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
-      [offset]"=&r"(offset), [out]"+&r"(out)
-    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
-      [loop_end]"r"(loop_end), [rest]"r"(rest)
-    : "memory"
-  );
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitMIPSdspR2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
-  VP8PackARGB = PackARGB;
-  VP8PackRGB = PackRGB;
-}
-
-#else  // !WEBP_USE_MIPS_DSP_R2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
-
-#endif  // WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/argb_sse2.c b/thirdparty/libwebp/dsp/argb_sse2.c
deleted file mode 100644
index afcb1957e7..0000000000
--- a/thirdparty/libwebp/dsp/argb_sse2.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-//   ARGB making functions (SSE2 version).
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include "./dsp.h"
-
-#if defined(WEBP_USE_SSE2)
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <string.h>
-
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
-static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
-                     const uint8_t* b, int len, uint32_t* out) {
-  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
-    int i = 0;
-    const int len_max = len & ~3;  // max length processed in main loop
-    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
-    assert(b == r + 2);
-    assert(a == r + 3);
-    for (; i < len_max; i += 4) {
-      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
-      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
-      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
-      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
-      const __m128i F = _mm_or_si128(E, C);
-      _mm_storeu_si128((__m128i*)(out + i), F);
-    }
-    for (; i < len; ++i) {
-      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
-    }
-  } else {
-    assert(g == b + 1);
-    assert(r == b + 2);
-    assert(a == b + 3);
-    memcpy(out, b, len * 4);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Entry point
-
-extern void VP8EncDspARGBInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
-  VP8PackARGB = PackARGB;
-}
-
-#else  // !WEBP_USE_SSE2
-
-WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
-
-#endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/enc/backward_references_enc.c b/thirdparty/libwebp/enc/backward_references_enc.c
deleted file mode 100644
index 7c0559ff1e..0000000000
--- a/thirdparty/libwebp/enc/backward_references_enc.c
+++ /dev/null
@@ -1,1800 +0,0 @@
-// Copyright 2012 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Jyrki Alakuijala (jyrki@google.com)
-//
-
-#include <assert.h>
-#include <math.h>
-
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../dsp/dsp.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/utils.h"
-
-#define VALUES_IN_BYTE 256
-
-#define MIN_BLOCK_SIZE 256  // minimum block size for backward references
-
-#define MAX_ENTROPY    (1e30f)
-
-// 1M window (4M bytes) minus 120 special codes for short distances.
-#define WINDOW_SIZE_BITS 20
-#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
-
-// Minimum number of pixels for which it is cheaper to encode a
-// distance + length instead of each pixel as a literal.
-#define MIN_LENGTH 4
-// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
-// is used in VP8LHashChain.
-#define MAX_LENGTH_BITS 12
-// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
-#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
-#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
-#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
-#endif
-
-// -----------------------------------------------------------------------------
-
-static const uint8_t plane_to_code_lut[128] = {
- 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
- 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
- 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
- 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
- 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
- 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
- 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
- 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
-};
-
-static int DistanceToPlaneCode(int xsize, int dist) {
-  const int yoffset = dist / xsize;
-  const int xoffset = dist - yoffset * xsize;
-  if (xoffset <= 8 && yoffset < 8) {
-    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
-  } else if (xoffset > xsize - 8 && yoffset < 7) {
-    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
-  }
-  return dist + 120;
-}
-
-// Returns the exact index where array1 and array2 are different. For an index
-// inferior or equal to best_len_match, the return value just has to be strictly
-// inferior to best_len_match. The current behavior is to return 0 if this index
-// is best_len_match, and the index itself otherwise.
-// If no two elements are the same, it returns max_limit.
-static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
-                                       const uint32_t* const array2,
-                                       int best_len_match, int max_limit) {
-  // Before 'expensive' linear match, check if the two arrays match at the
-  // current best length index.
-  if (array1[best_len_match] != array2[best_len_match]) return 0;
-
-  return VP8LVectorMismatch(array1, array2, max_limit);
-}
-
-// -----------------------------------------------------------------------------
-//  VP8LBackwardRefs
-
-struct PixOrCopyBlock {
-  PixOrCopyBlock* next_;   // next block (or NULL)
-  PixOrCopy* start_;       // data start
-  int size_;               // currently used size
-};
-
-static void ClearBackwardRefs(VP8LBackwardRefs* const refs) {
-  assert(refs != NULL);
-  if (refs->tail_ != NULL) {
-    *refs->tail_ = refs->free_blocks_;  // recycle all blocks at once
-  }
-  refs->free_blocks_ = refs->refs_;
-  refs->tail_ = &refs->refs_;
-  refs->last_block_ = NULL;
-  refs->refs_ = NULL;
-}
-
-void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
-  assert(refs != NULL);
-  ClearBackwardRefs(refs);
-  while (refs->free_blocks_ != NULL) {
-    PixOrCopyBlock* const next = refs->free_blocks_->next_;
-    WebPSafeFree(refs->free_blocks_);
-    refs->free_blocks_ = next;
-  }
-}
-
-void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
-  assert(refs != NULL);
-  memset(refs, 0, sizeof(*refs));
-  refs->tail_ = &refs->refs_;
-  refs->block_size_ =
-      (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
-}
-
-VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
-  VP8LRefsCursor c;
-  c.cur_block_ = refs->refs_;
-  if (refs->refs_ != NULL) {
-    c.cur_pos = c.cur_block_->start_;
-    c.last_pos_ = c.cur_pos + c.cur_block_->size_;
-  } else {
-    c.cur_pos = NULL;
-    c.last_pos_ = NULL;
-  }
-  return c;
-}
-
-void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
-  PixOrCopyBlock* const b = c->cur_block_->next_;
-  c->cur_pos = (b == NULL) ? NULL : b->start_;
-  c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
-  c->cur_block_ = b;
-}
-
-// Create a new block, either from the free list or allocated
-static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
-  PixOrCopyBlock* b = refs->free_blocks_;
-  if (b == NULL) {   // allocate new memory chunk
-    const size_t total_size =
-        sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
-    b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
-    if (b == NULL) {
-      refs->error_ |= 1;
-      return NULL;
-    }
-    b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b));  // not always aligned
-  } else {  // recycle from free-list
-    refs->free_blocks_ = b->next_;
-  }
-  *refs->tail_ = b;
-  refs->tail_ = &b->next_;
-  refs->last_block_ = b;
-  b->next_ = NULL;
-  b->size_ = 0;
-  return b;
-}
-
-static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
-                                              const PixOrCopy v) {
-  PixOrCopyBlock* b = refs->last_block_;
-  if (b == NULL || b->size_ == refs->block_size_) {
-    b = BackwardRefsNewBlock(refs);
-    if (b == NULL) return;   // refs->error_ is set
-  }
-  b->start_[b->size_++] = v;
-}
-
-int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
-                         VP8LBackwardRefs* const dst) {
-  const PixOrCopyBlock* b = src->refs_;
-  ClearBackwardRefs(dst);
-  assert(src->block_size_ == dst->block_size_);
-  while (b != NULL) {
-    PixOrCopyBlock* const new_b = BackwardRefsNewBlock(dst);
-    if (new_b == NULL) return 0;   // dst->error_ is set
-    memcpy(new_b->start_, b->start_, b->size_ * sizeof(*b->start_));
-    new_b->size_ = b->size_;
-    b = b->next_;
-  }
-  return 1;
-}
-
-// -----------------------------------------------------------------------------
-// Hash chains
-
-int VP8LHashChainInit(VP8LHashChain* const p, int size) {
-  assert(p->size_ == 0);
-  assert(p->offset_length_ == NULL);
-  assert(size > 0);
-  p->offset_length_ =
-      (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
-  if (p->offset_length_ == NULL) return 0;
-  p->size_ = size;
-
-  return 1;
-}
-
-void VP8LHashChainClear(VP8LHashChain* const p) {
-  assert(p != NULL);
-  WebPSafeFree(p->offset_length_);
-
-  p->size_ = 0;
-  p->offset_length_ = NULL;
-}
-
-// -----------------------------------------------------------------------------
-
-#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
-#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
-
-static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
-  uint32_t key;
-  key  = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
-  key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
-  key = key >> (32 - HASH_BITS);
-  return key;
-}
-
-// Returns the maximum number of hash chain lookups to do for a
-// given compression quality. Return value in range [8, 86].
-static int GetMaxItersForQuality(int quality) {
-  return 8 + (quality * quality) / 128;
-}
-
-static int GetWindowSizeForHashChain(int quality, int xsize) {
-  const int max_window_size = (quality > 75) ? WINDOW_SIZE
-                            : (quality > 50) ? (xsize << 8)
-                            : (quality > 25) ? (xsize << 6)
-                            : (xsize << 4);
-  assert(xsize > 0);
-  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
-}
-
-static WEBP_INLINE int MaxFindCopyLength(int len) {
-  return (len < MAX_LENGTH) ? len : MAX_LENGTH;
-}
-
-int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize,
-                      int low_effort) {
-  const int size = xsize * ysize;
-  const int iter_max = GetMaxItersForQuality(quality);
-  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
-  int pos;
-  int argb_comp;
-  uint32_t base_position;
-  int32_t* hash_to_first_index;
-  // Temporarily use the p->offset_length_ as a hash chain.
-  int32_t* chain = (int32_t*)p->offset_length_;
-  assert(size > 0);
-  assert(p->size_ != 0);
-  assert(p->offset_length_ != NULL);
-
-  if (size <= 2) {
-    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-    return 1;
-  }
-
-  hash_to_first_index =
-      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
-  if (hash_to_first_index == NULL) return 0;
-
-  // Set the int32_t array to -1.
-  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
-  // Fill the chain linking pixels with the same hash.
-  argb_comp = (argb[0] == argb[1]);
-  for (pos = 0; pos < size - 2;) {
-    uint32_t hash_code;
-    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
-    if (argb_comp && argb_comp_next) {
-      // Consecutive pixels with the same color will share the same hash.
-      // We therefore use a different hash: the color and its repetition
-      // length.
-      uint32_t tmp[2];
-      uint32_t len = 1;
-      tmp[0] = argb[pos];
-      // Figure out how far the pixels are the same.
-      // The last pixel has a different 64 bit hash, as its next pixel does
-      // not have the same color, so we just need to get to the last pixel equal
-      // to its follower.
-      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
-        ++len;
-      }
-      if (len > MAX_LENGTH) {
-        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
-        // because they are linked to their predecessor and we automatically
-        // check that in the main for loop below. Skipping means setting no
-        // predecessor in the chain, hence -1.
-        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
-        pos += len - MAX_LENGTH;
-        len = MAX_LENGTH;
-      }
-      // Process the rest of the hash chain.
-      while (len) {
-        tmp[1] = len--;
-        hash_code = GetPixPairHash64(tmp);
-        chain[pos] = hash_to_first_index[hash_code];
-        hash_to_first_index[hash_code] = pos++;
-      }
-      argb_comp = 0;
-    } else {
-      // Just move one pixel forward.
-      hash_code = GetPixPairHash64(argb + pos);
-      chain[pos] = hash_to_first_index[hash_code];
-      hash_to_first_index[hash_code] = pos++;
-      argb_comp = argb_comp_next;
-    }
-  }
-  // Process the penultimate pixel.
-  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
-
-  WebPSafeFree(hash_to_first_index);
-
-  // Find the best match interval at each pixel, defined by an offset to the
-  // pixel and a length. The right-most pixel cannot match anything to the right
-  // (hence a best length of 0) and the left-most pixel nothing to the left
-  // (hence an offset of 0).
-  assert(size > 2);
-  p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-  for (base_position = size - 2; base_position > 0;) {
-    const int max_len = MaxFindCopyLength(size - 1 - base_position);
-    const uint32_t* const argb_start = argb + base_position;
-    int iter = iter_max;
-    int best_length = 0;
-    uint32_t best_distance = 0;
-    uint32_t best_argb;
-    const int min_pos =
-        (base_position > window_size) ? base_position - window_size : 0;
-    const int length_max = (max_len < 256) ? max_len : 256;
-    uint32_t max_base_position;
-
-    pos = chain[base_position];
-    if (!low_effort) {
-      int curr_length;
-      // Heuristic: use the comparison with the above line as an initialization.
-      if (base_position >= (uint32_t)xsize) {
-        curr_length = FindMatchLength(argb_start - xsize, argb_start,
-                                      best_length, max_len);
-        if (curr_length > best_length) {
-          best_length = curr_length;
-          best_distance = xsize;
-        }
-        --iter;
-      }
-      // Heuristic: compare to the previous pixel.
-      curr_length =
-          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
-      if (curr_length > best_length) {
-        best_length = curr_length;
-        best_distance = 1;
-      }
-      --iter;
-      // Skip the for loop if we already have the maximum.
-      if (best_length == MAX_LENGTH) pos = min_pos - 1;
-    }
-    best_argb = argb_start[best_length];
-
-    for (; pos >= min_pos && --iter; pos = chain[pos]) {
-      int curr_length;
-      assert(base_position > (uint32_t)pos);
-
-      if (argb[pos + best_length] != best_argb) continue;
-
-      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
-      if (best_length < curr_length) {
-        best_length = curr_length;
-        best_distance = base_position - pos;
-        best_argb = argb_start[best_length];
-        // Stop if we have reached a good enough length.
-        if (best_length >= length_max) break;
-      }
-    }
-    // We have the best match but in case the two intervals continue matching
-    // to the left, we have the best matches for the left-extended pixels.
-    max_base_position = base_position;
-    while (1) {
-      assert(best_length <= MAX_LENGTH);
-      assert(best_distance <= WINDOW_SIZE);
-      p->offset_length_[base_position] =
-          (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
-      --base_position;
-      // Stop if we don't have a match or if we are out of bounds.
-      if (best_distance == 0 || base_position == 0) break;
-      // Stop if we cannot extend the matching intervals to the left.
-      if (base_position < best_distance ||
-          argb[base_position - best_distance] != argb[base_position]) {
-        break;
-      }
-      // Stop if we are matching at its limit because there could be a closer
-      // matching interval with the same maximum length. Then again, if the
-      // matching interval is as close as possible (best_distance == 1), we will
-      // never find anything better so let's continue.
-      if (best_length == MAX_LENGTH && best_distance != 1 &&
-          base_position + MAX_LENGTH < max_base_position) {
-        break;
-      }
-      if (best_length < MAX_LENGTH) {
-        ++best_length;
-        max_base_position = base_position;
-      }
-    }
-  }
-  return 1;
-}
-
-static WEBP_INLINE int HashChainFindOffset(const VP8LHashChain* const p,
-                                           const int base_position) {
-  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
-}
-
-static WEBP_INLINE int HashChainFindLength(const VP8LHashChain* const p,
-                                           const int base_position) {
-  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
-}
-
-static WEBP_INLINE void HashChainFindCopy(const VP8LHashChain* const p,
-                                          int base_position,
-                                          int* const offset_ptr,
-                                          int* const length_ptr) {
-  *offset_ptr = HashChainFindOffset(p, base_position);
-  *length_ptr = HashChainFindLength(p, base_position);
-}
-
-static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
-                                         VP8LColorCache* const hashers,
-                                         VP8LBackwardRefs* const refs) {
-  PixOrCopy v;
-  if (use_color_cache) {
-    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
-    if (VP8LColorCacheLookup(hashers, key) == pixel) {
-      v = PixOrCopyCreateCacheIdx(key);
-    } else {
-      v = PixOrCopyCreateLiteral(pixel);
-      VP8LColorCacheSet(hashers, key, pixel);
-    }
-  } else {
-    v = PixOrCopyCreateLiteral(pixel);
-  }
-  BackwardRefsCursorAdd(refs, v);
-}
-
-static int BackwardReferencesRle(int xsize, int ysize,
-                                 const uint32_t* const argb,
-                                 int cache_bits, VP8LBackwardRefs* const refs) {
-  const int pix_count = xsize * ysize;
-  int i, k;
-  const int use_color_cache = (cache_bits > 0);
-  VP8LColorCache hashers;
-
-  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
-    return 0;
-  }
-  ClearBackwardRefs(refs);
-  // Add first pixel as literal.
-  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
-  i = 1;
-  while (i < pix_count) {
-    const int max_len = MaxFindCopyLength(pix_count - i);
-    const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
-    const int prev_row_len = (i < xsize) ? 0 :
-        FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
-    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
-      // We don't need to update the color cache here since it is always the
-      // same pixel being copied, and that does not change the color cache
-      // state.
-      i += rle_len;
-    } else if (prev_row_len >= MIN_LENGTH) {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
-      if (use_color_cache) {
-        for (k = 0; k < prev_row_len; ++k) {
-          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        }
-      }
-      i += prev_row_len;
-    } else {
-      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
-      i++;
-    }
-  }
-  if (use_color_cache) VP8LColorCacheClear(&hashers);
-  return !refs->error_;
-}
-
-static int BackwardReferencesLz77(int xsize, int ysize,
-                                  const uint32_t* const argb, int cache_bits,
-                                  const VP8LHashChain* const hash_chain,
-                                  VP8LBackwardRefs* const refs) {
-  int i;
-  int i_last_check = -1;
-  int ok = 0;
-  int cc_init = 0;
-  const int use_color_cache = (cache_bits > 0);
-  const int pix_count = xsize * ysize;
-  VP8LColorCache hashers;
-
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-  ClearBackwardRefs(refs);
-  for (i = 0; i < pix_count;) {
-    // Alternative#1: Code the pixels starting at 'i' using backward reference.
-    int offset = 0;
-    int len = 0;
-    int j;
-    HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= MIN_LENGTH) {
-      const int len_ini = len;
-      int max_reach = 0;
-      assert(i + len < pix_count);
-      // Only start from what we have not checked already.
-      i_last_check = (i > i_last_check) ? i : i_last_check;
-      // We know the best match for the current pixel but we try to find the
-      // best matches for the current pixel AND the next one combined.
-      // The naive method would use the intervals:
-      // [i,i+len) + [i+len, length of best match at i+len)
-      // while we check if we can use:
-      // [i,j) (where j<=i+len) + [j, length of best match at j)
-      for (j = i_last_check + 1; j <= i + len_ini; ++j) {
-        const int len_j = HashChainFindLength(hash_chain, j);
-        const int reach =
-            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
-        if (reach > max_reach) {
-          len = j - i;
-          max_reach = reach;
-        }
-      }
-    } else {
-      len = 1;
-    }
-    // Go with literal or backward reference.
-    assert(len > 0);
-    if (len == 1) {
-      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
-    } else {
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
-      if (use_color_cache) {
-        for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
-      }
-    }
-    i += len;
-  }
-
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  return ok;
-}
-
-// -----------------------------------------------------------------------------
-
-typedef struct {
-  double alpha_[VALUES_IN_BYTE];
-  double red_[VALUES_IN_BYTE];
-  double blue_[VALUES_IN_BYTE];
-  double distance_[NUM_DISTANCE_CODES];
-  double* literal_;
-} CostModel;
-
-static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs);
-
-static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const uint32_t population_counts[], double output[]) {
-  uint32_t sum = 0;
-  int nonzeros = 0;
-  int i;
-  for (i = 0; i < num_symbols; ++i) {
-    sum += population_counts[i];
-    if (population_counts[i] > 0) {
-      ++nonzeros;
-    }
-  }
-  if (nonzeros <= 1) {
-    memset(output, 0, num_symbols * sizeof(*output));
-  } else {
-    const double logsum = VP8LFastLog2(sum);
-    for (i = 0; i < num_symbols; ++i) {
-      output[i] = logsum - VP8LFastLog2(population_counts[i]);
-    }
-  }
-}
-
-static int CostModelBuild(CostModel* const m, int cache_bits,
-                          VP8LBackwardRefs* const refs) {
-  int ok = 0;
-  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
-  if (histo == NULL) goto Error;
-
-  VP8LHistogramCreate(histo, refs, cache_bits);
-
-  ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(histo->palette_code_bits_),
-      histo->literal_, m->literal_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->red_, m->red_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->blue_, m->blue_);
-  ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
-  ConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
-  ok = 1;
-
- Error:
-  VP8LFreeHistogram(histo);
-  return ok;
-}
-
-static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
-  return m->alpha_[v >> 24] +
-         m->red_[(v >> 16) & 0xff] +
-         m->literal_[(v >> 8) & 0xff] +
-         m->blue_[v & 0xff];
-}
-
-static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
-  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
-  return m->literal_[literal_idx];
-}
-
-static WEBP_INLINE double GetLengthCost(const CostModel* const m,
-                                        uint32_t length) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(length, &code, &extra_bits);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
-}
-
-static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
-                                          uint32_t distance) {
-  int code, extra_bits;
-  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
-  return m->distance_[code] + extra_bits;
-}
-
-static void AddSingleLiteralWithCostModel(const uint32_t* const argb,
-                                          VP8LColorCache* const hashers,
-                                          const CostModel* const cost_model,
-                                          int idx, int use_color_cache,
-                                          double prev_cost, float* const cost,
-                                          uint16_t* const dist_array) {
-  double cost_val = prev_cost;
-  const uint32_t color = argb[0];
-  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
-  if (ix >= 0) {
-    // use_color_cache is true and hashers contains color
-    const double mul0 = 0.68;
-    cost_val += GetCacheCost(cost_model, ix) * mul0;
-  } else {
-    const double mul1 = 0.82;
-    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
-    cost_val += GetLiteralCost(cost_model, color) * mul1;
-  }
-  if (cost[idx] > cost_val) {
-    cost[idx] = (float)cost_val;
-    dist_array[idx] = 1;  // only one is inserted.
-  }
-}
-
-// -----------------------------------------------------------------------------
-// CostManager and interval handling
-
-// Empirical value to avoid high memory consumption but good for performance.
-#define COST_CACHE_INTERVAL_SIZE_MAX 100
-
-// To perform backward reference every pixel at index index_ is considered and
-// the cost for the MAX_LENGTH following pixels computed. Those following pixels
-// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
-//     distance_cost_ at index_ + GetLengthCost(cost_model, k)
-//            (named cost)            (named cached cost)
-// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
-// array of size MAX_LENGTH.
-// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
-// minimal values using intervals, for which lower_ and upper_ bounds are kept.
-// An interval is defined by the index_ of the pixel that generated it and
-// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
-// it contains the minimum value for pixels between start_ and end_.
-// Intervals are stored in a linked list and ordered by start_. When a new
-// interval has a better minimum, old intervals are split or removed.
-typedef struct CostInterval CostInterval;
-struct CostInterval {
-  double lower_;
-  double upper_;
-  int start_;
-  int end_;
-  double distance_cost_;
-  int index_;
-  CostInterval* previous_;
-  CostInterval* next_;
-};
-
-// The GetLengthCost(cost_model, k) part of the costs is also bounded for
-// efficiency in a set of intervals of a different type.
-// If those intervals are small enough, they are not used for comparison and
-// written into the costs right away.
-typedef struct {
-  double lower_;  // Lower bound of the interval.
-  double upper_;  // Upper bound of the interval.
-  int start_;
-  int end_;       // Exclusive.
-  int do_write_;  // If !=0, the interval is saved to cost instead of being kept
-                  // for comparison.
-} CostCacheInterval;
-
-// This structure is in charge of managing intervals and costs.
-// It caches the different CostCacheInterval, caches the different
-// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
-// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
-#define COST_MANAGER_MAX_FREE_LIST 10
-typedef struct {
-  CostInterval* head_;
-  int count_;  // The number of stored intervals.
-  CostCacheInterval* cache_intervals_;
-  size_t cache_intervals_size_;
-  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
-  double min_cost_cache_;          // The minimum value in cost_cache_[1:].
-  double max_cost_cache_;          // The maximum value in cost_cache_[1:].
-  float* costs_;
-  uint16_t* dist_array_;
-  // Most of the time, we only need few intervals -> use a free-list, to avoid
-  // fragmentation with small allocs in most common cases.
-  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
-  CostInterval* free_intervals_;
-  // These are regularly malloc'd remains. This list can't grow larger than than
-  // size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
-  CostInterval* recycled_intervals_;
-  // Buffer used in BackwardReferencesHashChainDistanceOnly to store the ends
-  // of the intervals that can have impacted the cost at a pixel.
-  int* interval_ends_;
-  int interval_ends_size_;
-} CostManager;
-
-static int IsCostCacheIntervalWritable(int start, int end) {
-  // 100 is the length for which we consider an interval for comparison, and not
-  // for writing.
-  // The first intervals are very small and go in increasing size. This constant
-  // helps merging them into one big interval (up to index 150/200 usually from
-  // which intervals start getting much bigger).
-  // This value is empirical.
-  return (end - start + 1 < 100);
-}
-
-static void CostIntervalAddToFreeList(CostManager* const manager,
-                                      CostInterval* const interval) {
-  interval->next_ = manager->free_intervals_;
-  manager->free_intervals_ = interval;
-}
-
-static int CostIntervalIsInFreeList(const CostManager* const manager,
-                                    const CostInterval* const interval) {
-  return (interval >= &manager->intervals_[0] &&
-          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
-}
-
-static void CostManagerInitFreeList(CostManager* const manager) {
-  int i;
-  manager->free_intervals_ = NULL;
-  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
-    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
-  }
-}
-
-static void DeleteIntervalList(CostManager* const manager,
-                               const CostInterval* interval) {
-  while (interval != NULL) {
-    const CostInterval* const next = interval->next_;
-    if (!CostIntervalIsInFreeList(manager, interval)) {
-      WebPSafeFree((void*)interval);
-    }  // else: do nothing
-    interval = next;
-  }
-}
-
-static void CostManagerClear(CostManager* const manager) {
-  if (manager == NULL) return;
-
-  WebPSafeFree(manager->costs_);
-  WebPSafeFree(manager->cache_intervals_);
-  WebPSafeFree(manager->interval_ends_);
-
-  // Clear the interval lists.
-  DeleteIntervalList(manager, manager->head_);
-  manager->head_ = NULL;
-  DeleteIntervalList(manager, manager->recycled_intervals_);
-  manager->recycled_intervals_ = NULL;
-
-  // Reset pointers, count_ and cache_intervals_size_.
-  memset(manager, 0, sizeof(*manager));
-  CostManagerInitFreeList(manager);
-}
-
-static int CostManagerInit(CostManager* const manager,
-                           uint16_t* const dist_array, int pix_count,
-                           const CostModel* const cost_model) {
-  int i;
-  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
-  // This constant is tied to the cost_model we use.
-  // Empirically, differences between intervals is usually of more than 1.
-  const double min_cost_diff = 0.1;
-
-  manager->costs_ = NULL;
-  manager->cache_intervals_ = NULL;
-  manager->interval_ends_ = NULL;
-  manager->head_ = NULL;
-  manager->recycled_intervals_ = NULL;
-  manager->count_ = 0;
-  manager->dist_array_ = dist_array;
-  CostManagerInitFreeList(manager);
-
-  // Fill in the cost_cache_.
-  manager->cache_intervals_size_ = 1;
-  manager->cost_cache_[0] = 0;
-  for (i = 1; i < cost_cache_size; ++i) {
-    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
-    // Get an approximation of the number of bound intervals.
-    if (fabs(manager->cost_cache_[i] - manager->cost_cache_[i - 1]) >
-        min_cost_diff) {
-      ++manager->cache_intervals_size_;
-    }
-    // Compute the minimum of cost_cache_.
-    if (i == 1) {
-      manager->min_cost_cache_ = manager->cost_cache_[1];
-      manager->max_cost_cache_ = manager->cost_cache_[1];
-    } else if (manager->cost_cache_[i] < manager->min_cost_cache_) {
-      manager->min_cost_cache_ = manager->cost_cache_[i];
-    } else if (manager->cost_cache_[i] > manager->max_cost_cache_) {
-      manager->max_cost_cache_ = manager->cost_cache_[i];
-    }
-  }
-
-  // With the current cost models, we have 15 intervals, so we are safe by
-  // setting a maximum of COST_CACHE_INTERVAL_SIZE_MAX.
-  if (manager->cache_intervals_size_ > COST_CACHE_INTERVAL_SIZE_MAX) {
-    manager->cache_intervals_size_ = COST_CACHE_INTERVAL_SIZE_MAX;
-  }
-  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
-      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
-  if (manager->cache_intervals_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-
-  // Fill in the cache_intervals_.
-  {
-    double cost_prev = -1e38f;  // unprobably low initial value
-    CostCacheInterval* prev = NULL;
-    CostCacheInterval* cur = manager->cache_intervals_;
-    const CostCacheInterval* const end =
-        manager->cache_intervals_ + manager->cache_intervals_size_;
-
-    // Consecutive values in cost_cache_ are compared and if a big enough
-    // difference is found, a new interval is created and bounded.
-    for (i = 0; i < cost_cache_size; ++i) {
-      const double cost_val = manager->cost_cache_[i];
-      if (i == 0 ||
-          (fabs(cost_val - cost_prev) > min_cost_diff && cur + 1 < end)) {
-        if (i > 1) {
-          const int is_writable =
-              IsCostCacheIntervalWritable(cur->start_, cur->end_);
-          // Merge with the previous interval if both are writable.
-          if (is_writable && cur != manager->cache_intervals_ &&
-              prev->do_write_) {
-            // Update the previous interval.
-            prev->end_ = cur->end_;
-            if (cur->lower_ < prev->lower_) {
-              prev->lower_ = cur->lower_;
-            } else if (cur->upper_ > prev->upper_) {
-              prev->upper_ = cur->upper_;
-            }
-          } else {
-            cur->do_write_ = is_writable;
-            prev = cur;
-            ++cur;
-          }
-        }
-        // Initialize an interval.
-        cur->start_ = i;
-        cur->do_write_ = 0;
-        cur->lower_ = cost_val;
-        cur->upper_ = cost_val;
-      } else {
-        // Update the current interval bounds.
-        if (cost_val < cur->lower_) {
-          cur->lower_ = cost_val;
-        } else if (cost_val > cur->upper_) {
-          cur->upper_ = cost_val;
-        }
-      }
-      cur->end_ = i + 1;
-      cost_prev = cost_val;
-    }
-    manager->cache_intervals_size_ = cur + 1 - manager->cache_intervals_;
-  }
-
-  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
-  if (manager->costs_ == NULL) {
-    CostManagerClear(manager);
-    return 0;
-  }
-  // Set the initial costs_ high for every pixel as we will keep the minimum.
-  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
-
-  // The cost at pixel is influenced by the cost intervals from previous pixels.
-  // Let us take the specific case where the offset is the same (which actually
-  // happens a lot in case of uniform regions).
-  // pixel i contributes to j>i a cost of: offset cost + cost_cache_[j-i]
-  // pixel i+1 contributes to j>i a cost of: 2*offset cost + cost_cache_[j-i-1]
-  // pixel i+2 contributes to j>i a cost of: 3*offset cost + cost_cache_[j-i-2]
-  // and so on.
-  // A pixel i influences the following length(j) < MAX_LENGTH pixels. What is
-  // the value of j such that pixel i + j cannot influence any of those pixels?
-  // This value is such that:
-  //               max of cost_cache_ < j*offset cost + min of cost_cache_
-  // (pixel i + j 's cost cannot beat the worst cost given by pixel i).
-  // This value will be used to optimize the cost computation in
-  // BackwardReferencesHashChainDistanceOnly.
-  {
-    // The offset cost is computed in GetDistanceCost and has a minimum value of
-    // the minimum in cost_model->distance_. The case where the offset cost is 0
-    // will be dealt with differently later so we are only interested in the
-    // minimum non-zero offset cost.
-    double offset_cost_min = 0.;
-    int size;
-    for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-      if (cost_model->distance_[i] != 0) {
-        if (offset_cost_min == 0.) {
-          offset_cost_min = cost_model->distance_[i];
-        } else if (cost_model->distance_[i] < offset_cost_min) {
-          offset_cost_min = cost_model->distance_[i];
-        }
-      }
-    }
-    // In case all the cost_model->distance_ is 0, the next non-zero cost we
-    // can have is from the extra bit in GetDistanceCost, hence 1.
-    if (offset_cost_min < 1.) offset_cost_min = 1.;
-
-    size = 1 + (int)ceil((manager->max_cost_cache_ - manager->min_cost_cache_) /
-                         offset_cost_min);
-    // Empirically, we usually end up with a value below 100.
-    if (size > MAX_LENGTH) size = MAX_LENGTH;
-
-    manager->interval_ends_ =
-        (int*)WebPSafeMalloc(size, sizeof(*manager->interval_ends_));
-    if (manager->interval_ends_ == NULL) {
-      CostManagerClear(manager);
-      return 0;
-    }
-    manager->interval_ends_size_ = size;
-  }
-
-  return 1;
-}
-
-// Given the distance_cost for pixel 'index', update the cost at pixel 'i' if it
-// is smaller than the previously computed value.
-static WEBP_INLINE void UpdateCost(CostManager* const manager, int i, int index,
-                                   double distance_cost) {
-  int k = i - index;
-  double cost_tmp;
-  assert(k >= 0 && k < MAX_LENGTH);
-  cost_tmp = distance_cost + manager->cost_cache_[k];
-
-  if (manager->costs_[i] > cost_tmp) {
-    manager->costs_[i] = (float)cost_tmp;
-    manager->dist_array_[i] = k + 1;
-  }
-}
-
-// Given the distance_cost for pixel 'index', update the cost for all the pixels
-// between 'start' and 'end' excluded.
-static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
-                                              int start, int end, int index,
-                                              double distance_cost) {
-  int i;
-  for (i = start; i < end; ++i) UpdateCost(manager, i, index, distance_cost);
-}
-
-// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
-static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
-                                         CostInterval* const prev,
-                                         CostInterval* const next) {
-  if (prev != NULL) {
-    prev->next_ = next;
-  } else {
-    manager->head_ = next;
-  }
-
-  if (next != NULL) next->previous_ = prev;
-}
-
-// Pop an interval in the manager.
-static WEBP_INLINE void PopInterval(CostManager* const manager,
-                                    CostInterval* const interval) {
-  CostInterval* const next = interval->next_;
-
-  if (interval == NULL) return;
-
-  ConnectIntervals(manager, interval->previous_, next);
-  if (CostIntervalIsInFreeList(manager, interval)) {
-    CostIntervalAddToFreeList(manager, interval);
-  } else {  // recycle regularly malloc'd intervals too
-    interval->next_ = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval;
-  }
-  --manager->count_;
-  assert(manager->count_ >= 0);
-}
-
-// Update the cost at index i by going over all the stored intervals that
-// overlap with i.
-static WEBP_INLINE void UpdateCostPerIndex(CostManager* const manager, int i) {
-  CostInterval* current = manager->head_;
-
-  while (current != NULL && current->start_ <= i) {
-    if (current->end_ <= i) {
-      // We have an outdated interval, remove it.
-      CostInterval* next = current->next_;
-      PopInterval(manager, current);
-      current = next;
-    } else {
-      UpdateCost(manager, i, current->index_, current->distance_cost_);
-      current = current->next_;
-    }
-  }
-}
-
-// Given a current orphan interval and its previous interval, before
-// it was orphaned (which can be NULL), set it at the right place in the list
-// of intervals using the start_ ordering and the previous interval as a hint.
-static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
-                                               CostInterval* const current,
-                                               CostInterval* previous) {
-  assert(current != NULL);
-
-  if (previous == NULL) previous = manager->head_;
-  while (previous != NULL && current->start_ < previous->start_) {
-    previous = previous->previous_;
-  }
-  while (previous != NULL && previous->next_ != NULL &&
-         previous->next_->start_ < current->start_) {
-    previous = previous->next_;
-  }
-
-  if (previous != NULL) {
-    ConnectIntervals(manager, current, previous->next_);
-  } else {
-    ConnectIntervals(manager, current, manager->head_);
-  }
-  ConnectIntervals(manager, previous, current);
-}
-
-// Insert an interval in the list contained in the manager by starting at
-// interval_in as a hint. The intervals are sorted by start_ value.
-static WEBP_INLINE void InsertInterval(CostManager* const manager,
-                                       CostInterval* const interval_in,
-                                       double distance_cost, double lower,
-                                       double upper, int index, int start,
-                                       int end) {
-  CostInterval* interval_new;
-
-  if (IsCostCacheIntervalWritable(start, end) ||
-      manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
-    // Write down the interval if it is too small.
-    UpdateCostPerInterval(manager, start, end, index, distance_cost);
-    return;
-  }
-  if (manager->free_intervals_ != NULL) {
-    interval_new = manager->free_intervals_;
-    manager->free_intervals_ = interval_new->next_;
-  } else if (manager->recycled_intervals_ != NULL) {
-    interval_new = manager->recycled_intervals_;
-    manager->recycled_intervals_ = interval_new->next_;
-  } else {   // malloc for good
-    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
-    if (interval_new == NULL) {
-      // Write down the interval if we cannot create it.
-      UpdateCostPerInterval(manager, start, end, index, distance_cost);
-      return;
-    }
-  }
-
-  interval_new->distance_cost_ = distance_cost;
-  interval_new->lower_ = lower;
-  interval_new->upper_ = upper;
-  interval_new->index_ = index;
-  interval_new->start_ = start;
-  interval_new->end_ = end;
-  PositionOrphanInterval(manager, interval_new, interval_in);
-
-  ++manager->count_;
-}
-
-// When an interval has its start_ or end_ modified, it needs to be
-// repositioned in the linked list.
-static WEBP_INLINE void RepositionInterval(CostManager* const manager,
-                                           CostInterval* const interval) {
-  if (IsCostCacheIntervalWritable(interval->start_, interval->end_)) {
-    // Maybe interval has been resized and is small enough to be removed.
-    UpdateCostPerInterval(manager, interval->start_, interval->end_,
-                          interval->index_, interval->distance_cost_);
-    PopInterval(manager, interval);
-    return;
-  }
-
-  // Early exit if interval is at the right spot.
-  if ((interval->previous_ == NULL ||
-       interval->previous_->start_ <= interval->start_) &&
-      (interval->next_ == NULL ||
-       interval->start_ <= interval->next_->start_)) {
-    return;
-  }
-
-  ConnectIntervals(manager, interval->previous_, interval->next_);
-  PositionOrphanInterval(manager, interval, interval->previous_);
-}
-
-// Given a new cost interval defined by its start at index, its last value and
-// distance_cost, add its contributions to the previous intervals and costs.
-// If handling the interval or one of its subintervals becomes to heavy, its
-// contribution is added to the costs right away.
-static WEBP_INLINE void PushInterval(CostManager* const manager,
-                                     double distance_cost, int index,
-                                     int last) {
-  size_t i;
-  CostInterval* interval = manager->head_;
-  CostInterval* interval_next;
-  const CostCacheInterval* const cost_cache_intervals =
-      manager->cache_intervals_;
-
-  for (i = 0; i < manager->cache_intervals_size_ &&
-              cost_cache_intervals[i].start_ < last;
-       ++i) {
-    // Define the intersection of the ith interval with the new one.
-    int start = index + cost_cache_intervals[i].start_;
-    const int end = index + (cost_cache_intervals[i].end_ > last
-                                 ? last
-                                 : cost_cache_intervals[i].end_);
-    const double lower_in = cost_cache_intervals[i].lower_;
-    const double upper_in = cost_cache_intervals[i].upper_;
-    const double lower_full_in = distance_cost + lower_in;
-    const double upper_full_in = distance_cost + upper_in;
-
-    if (cost_cache_intervals[i].do_write_) {
-      UpdateCostPerInterval(manager, start, end, index, distance_cost);
-      continue;
-    }
-
-    for (; interval != NULL && interval->start_ < end && start < end;
-         interval = interval_next) {
-      const double lower_full_interval =
-          interval->distance_cost_ + interval->lower_;
-      const double upper_full_interval =
-          interval->distance_cost_ + interval->upper_;
-
-      interval_next = interval->next_;
-
-      // Make sure we have some overlap
-      if (start >= interval->end_) continue;
-
-      if (lower_full_in >= upper_full_interval) {
-        // When intervals are represented, the lower, the better.
-        // [**********************************************************]
-        // start                                                    end
-        //                   [----------------------------------]
-        //                   interval->start_       interval->end_
-        // If we are worse than what we already have, add whatever we have so
-        // far up to interval.
-        const int start_new = interval->end_;
-        InsertInterval(manager, interval, distance_cost, lower_in, upper_in,
-                       index, start, interval->start_);
-        start = start_new;
-        continue;
-      }
-
-      // We know the two intervals intersect.
-      if (upper_full_in >= lower_full_interval) {
-        // There is no clear cut on which is best, so let's keep both.
-        // [*********[*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*]***********]
-        // start     interval->start_     interval->end_         end
-        // OR
-        // [*********[*-*-*-*-*-*-*-*-*-*-*-]----------------------]
-        // start     interval->start_     end          interval->end_
-        const int end_new = (interval->end_ <= end) ? interval->end_ : end;
-        InsertInterval(manager, interval, distance_cost, lower_in, upper_in,
-                       index, start, end_new);
-        start = end_new;
-      } else if (start <= interval->start_ && interval->end_ <= end) {
-        //                   [----------------------------------]
-        //                   interval->start_       interval->end_
-        // [**************************************************************]
-        // start                                                        end
-        // We can safely remove the old interval as it is fully included.
-        PopInterval(manager, interval);
-      } else {
-        if (interval->start_ <= start && end <= interval->end_) {
-          // [--------------------------------------------------------------]
-          // interval->start_                                  interval->end_
-          //                     [*****************************]
-          //                     start                       end
-          // We have to split the old interval as it fully contains the new one.
-          const int end_original = interval->end_;
-          interval->end_ = start;
-          InsertInterval(manager, interval, interval->distance_cost_,
-                         interval->lower_, interval->upper_, interval->index_,
-                         end, end_original);
-        } else if (interval->start_ < start) {
-          // [------------------------------------]
-          // interval->start_        interval->end_
-          //                     [*****************************]
-          //                     start                       end
-          interval->end_ = start;
-        } else {
-          //              [------------------------------------]
-          //              interval->start_        interval->end_
-          // [*****************************]
-          // start                       end
-          interval->start_ = end;
-        }
-
-        // The interval has been modified, we need to reposition it or write it.
-        RepositionInterval(manager, interval);
-      }
-    }
-    // Insert the remaining interval from start to end.
-    InsertInterval(manager, interval, distance_cost, lower_in, upper_in, index,
-                   start, end);
-  }
-}
-
-static int BackwardReferencesHashChainDistanceOnly(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs, uint16_t* const dist_array) {
-  int i;
-  int ok = 0;
-  int cc_init = 0;
-  const int pix_count = xsize * ysize;
-  const int use_color_cache = (cache_bits > 0);
-  const size_t literal_array_size = sizeof(double) *
-      (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
-       ((cache_bits > 0) ? (1 << cache_bits) : 0));
-  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
-  CostModel* const cost_model =
-      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
-  VP8LColorCache hashers;
-  const int skip_length = 32 + quality;
-  const int skip_min_distance_code = 2;
-  CostManager* cost_manager =
-      (CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
-
-  if (cost_model == NULL || cost_manager == NULL) goto Error;
-
-  cost_model->literal_ = (double*)(cost_model + 1);
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  if (!CostModelBuild(cost_model, cache_bits, refs)) {
-    goto Error;
-  }
-
-  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
-    goto Error;
-  }
-
-  // We loop one pixel at a time, but store all currently best points to
-  // non-processed locations from this point.
-  dist_array[0] = 0;
-  // Add first pixel as literal.
-  AddSingleLiteralWithCostModel(argb + 0, &hashers, cost_model, 0,
-                                use_color_cache, 0.0, cost_manager->costs_,
-                                dist_array);
-
-  for (i = 1; i < pix_count - 1; ++i) {
-    int offset = 0, len = 0;
-    double prev_cost = cost_manager->costs_[i - 1];
-    HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= 2) {
-      // If we are dealing with a non-literal.
-      const int code = DistanceToPlaneCode(xsize, offset);
-      const double offset_cost = GetDistanceCost(cost_model, code);
-      const int first_i = i;
-      int j_max = 0, interval_ends_index = 0;
-      const int is_offset_zero = (offset_cost == 0.);
-
-      if (!is_offset_zero) {
-        j_max = (int)ceil(
-            (cost_manager->max_cost_cache_ - cost_manager->min_cost_cache_) /
-            offset_cost);
-        if (j_max < 1) {
-          j_max = 1;
-        } else if (j_max > cost_manager->interval_ends_size_ - 1) {
-          // This could only happen in the case of MAX_LENGTH.
-          j_max = cost_manager->interval_ends_size_ - 1;
-        }
-      }  // else j_max is unused anyway.
-
-      // Instead of considering all contributions from a pixel i by calling:
-      //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
-      // we optimize these contributions in case offset_cost stays the same for
-      // consecutive pixels. This describes a set of pixels similar to a
-      // previous set (e.g. constant color regions).
-      for (; i < pix_count - 1; ++i) {
-        int offset_next, len_next;
-        prev_cost = cost_manager->costs_[i - 1];
-
-        if (is_offset_zero) {
-          // No optimization can be made so we just push all of the
-          // contributions from i.
-          PushInterval(cost_manager, prev_cost, i, len);
-        } else {
-          // j_max is chosen as the smallest j such that:
-          //       max of cost_cache_ < j*offset cost + min of cost_cache_
-          // Therefore, the pixel influenced by i-j_max, cannot be influenced
-          // by i. Only the costs after the end of what i contributed need to be
-          // updated. cost_manager->interval_ends_ is a circular buffer that
-          // stores those ends.
-          const double distance_cost = prev_cost + offset_cost;
-          int j = cost_manager->interval_ends_[interval_ends_index];
-          if (i - first_i <= j_max ||
-              !IsCostCacheIntervalWritable(j, i + len)) {
-            PushInterval(cost_manager, distance_cost, i, len);
-          } else {
-            for (; j < i + len; ++j) {
-              UpdateCost(cost_manager, j, i, distance_cost);
-            }
-          }
-          // Store the new end in the circular buffer.
-          assert(interval_ends_index < cost_manager->interval_ends_size_);
-          cost_manager->interval_ends_[interval_ends_index] = i + len;
-          if (++interval_ends_index > j_max) interval_ends_index = 0;
-        }
-
-        // Check whether i is the last pixel to consider, as it is handled
-        // differently.
-        if (i + 1 >= pix_count - 1) break;
-        HashChainFindCopy(hash_chain, i + 1, &offset_next, &len_next);
-        if (offset_next != offset) break;
-        len = len_next;
-        UpdateCostPerIndex(cost_manager, i);
-        AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i,
-                                      use_color_cache, prev_cost,
-                                      cost_manager->costs_, dist_array);
-      }
-      // Submit the last pixel.
-      UpdateCostPerIndex(cost_manager, i + 1);
-
-      // This if is for speedup only. It roughly doubles the speed, and
-      // makes compression worse by .1 %.
-      if (len >= skip_length && code <= skip_min_distance_code) {
-        // Long copy for short distances, let's skip the middle
-        // lookups for better copies.
-        // 1) insert the hashes.
-        if (use_color_cache) {
-          int k;
-          for (k = 0; k < len; ++k) {
-            VP8LColorCacheInsert(&hashers, argb[i + k]);
-          }
-        }
-        // 2) jump.
-        {
-          const int i_next = i + len - 1;  // for loop does ++i, thus -1 here.
-          for (; i <= i_next; ++i) UpdateCostPerIndex(cost_manager, i + 1);
-          i = i_next;
-        }
-        goto next_symbol;
-      }
-      if (len > 2) {
-        // Also try the smallest interval possible (size 2).
-        double cost_total =
-            prev_cost + offset_cost + GetLengthCost(cost_model, 1);
-        if (cost_manager->costs_[i + 1] > cost_total) {
-          cost_manager->costs_[i + 1] = (float)cost_total;
-          dist_array[i + 1] = 2;
-        }
-      }
-    } else {
-      // The pixel is added as a single literal so just update the costs.
-      UpdateCostPerIndex(cost_manager, i + 1);
-    }
-
-    AddSingleLiteralWithCostModel(argb + i, &hashers, cost_model, i,
-                                  use_color_cache, prev_cost,
-                                  cost_manager->costs_, dist_array);
-
- next_symbol: ;
-  }
-  // Handle the last pixel.
-  if (i == (pix_count - 1)) {
-    AddSingleLiteralWithCostModel(
-        argb + i, &hashers, cost_model, i, use_color_cache,
-        cost_manager->costs_[pix_count - 2], cost_manager->costs_, dist_array);
-  }
-
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  CostManagerClear(cost_manager);
-  WebPSafeFree(cost_model);
-  WebPSafeFree(cost_manager);
-  return ok;
-}
-
-// We pack the path at the end of *dist_array and return
-// a pointer to this part of the array. Example:
-// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint16_t* const dist_array,
-                           int dist_array_size,
-                           uint16_t** const chosen_path,
-                           int* const chosen_path_size) {
-  uint16_t* path = dist_array + dist_array_size;
-  uint16_t* cur = dist_array + dist_array_size - 1;
-  while (cur >= dist_array) {
-    const int k = *cur;
-    --path;
-    *path = k;
-    cur -= k;
-  }
-  *chosen_path = path;
-  *chosen_path_size = (int)(dist_array + dist_array_size - path);
-}
-
-static int BackwardReferencesHashChainFollowChosenPath(
-    const uint32_t* const argb, int cache_bits,
-    const uint16_t* const chosen_path, int chosen_path_size,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
-  const int use_color_cache = (cache_bits > 0);
-  int ix;
-  int i = 0;
-  int ok = 0;
-  int cc_init = 0;
-  VP8LColorCache hashers;
-
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
-  }
-
-  ClearBackwardRefs(refs);
-  for (ix = 0; ix < chosen_path_size; ++ix) {
-    const int len = chosen_path[ix];
-    if (len != 1) {
-      int k;
-      const int offset = HashChainFindOffset(hash_chain, i);
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
-      if (use_color_cache) {
-        for (k = 0; k < len; ++k) {
-          VP8LColorCacheInsert(&hashers, argb[i + k]);
-        }
-      }
-      i += len;
-    } else {
-      PixOrCopy v;
-      const int idx =
-          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
-      if (idx >= 0) {
-        // use_color_cache is true and hashers contains argb[i]
-        // push pixel as a color cache index
-        v = PixOrCopyCreateCacheIdx(idx);
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        v = PixOrCopyCreateLiteral(argb[i]);
-      }
-      BackwardRefsCursorAdd(refs, v);
-      ++i;
-    }
-  }
-  ok = !refs->error_;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  return ok;
-}
-
-// Returns 1 on success.
-static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, const uint32_t* const argb, int quality,
-    int cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs) {
-  int ok = 0;
-  const int dist_array_size = xsize * ysize;
-  uint16_t* chosen_path = NULL;
-  int chosen_path_size = 0;
-  uint16_t* dist_array =
-      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
-
-  if (dist_array == NULL) goto Error;
-
-  if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, argb, quality, cache_bits, hash_chain,
-      refs, dist_array)) {
-    goto Error;
-  }
-  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
-  if (!BackwardReferencesHashChainFollowChosenPath(
-          argb, cache_bits, chosen_path, chosen_path_size, hash_chain, refs)) {
-    goto Error;
-  }
-  ok = 1;
- Error:
-  WebPSafeFree(dist_array);
-  return ok;
-}
-
-static void BackwardReferences2DLocality(int xsize,
-                                         const VP8LBackwardRefs* const refs) {
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  while (VP8LRefsCursorOk(&c)) {
-    if (PixOrCopyIsCopy(c.cur_pos)) {
-      const int dist = c.cur_pos->argb_or_distance;
-      const int transformed_dist = DistanceToPlaneCode(xsize, dist);
-      c.cur_pos->argb_or_distance = transformed_dist;
-    }
-    VP8LRefsCursorNext(&c);
-  }
-}
-
-// Computes the entropies for a color cache size (in bits) between 0 (unused)
-// and cache_bits_max (inclusive).
-// Returns 1 on success, 0 in case of allocation error.
-static int ComputeCacheEntropies(const uint32_t* argb,
-                                 const VP8LBackwardRefs* const refs,
-                                 int cache_bits_max, double entropies[]) {
-  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
-  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
-  int ok = 0;
-  int i;
-
-  for (i = 0; i <= cache_bits_max; ++i) {
-    histos[i] = VP8LAllocateHistogram(i);
-    if (histos[i] == NULL) goto Error;
-    if (i == 0) continue;
-    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
-    if (!cc_init[i]) goto Error;
-  }
-
-  assert(cache_bits_max >= 0);
-  // Do not use the color cache for cache_bits=0.
-  while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos);
-    VP8LRefsCursorNext(&c);
-  }
-  if (cache_bits_max > 0) {
-    c = VP8LRefsCursorInit(refs);
-    while (VP8LRefsCursorOk(&c)) {
-      const PixOrCopy* const v = c.cur_pos;
-      if (PixOrCopyIsLiteral(v)) {
-        const uint32_t pix = *argb++;
-        // The keys of the caches can be derived from the longest one.
-        int key = HashPix(pix, 32 - cache_bits_max);
-        for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
-          if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
-            ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
-          } else {
-            VP8LColorCacheSet(&hashers[i], key, pix);
-            ++histos[i]->blue_[pix & 0xff];
-            ++histos[i]->literal_[(pix >> 8) & 0xff];
-            ++histos[i]->red_[(pix >> 16) & 0xff];
-            ++histos[i]->alpha_[pix >> 24];
-          }
-        }
-      } else {
-        // Update the histograms for distance/length.
-        int len = PixOrCopyLength(v);
-        int code_dist, code_len, extra_bits;
-        uint32_t argb_prev = *argb ^ 0xffffffffu;
-        VP8LPrefixEncodeBits(len, &code_len, &extra_bits);
-        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits);
-        for (i = 1; i <= cache_bits_max; ++i) {
-          ++histos[i]->literal_[NUM_LITERAL_CODES + code_len];
-          ++histos[i]->distance_[code_dist];
-        }
-        // Update the colors caches.
-        do {
-          if (*argb != argb_prev) {
-            // Efficiency: insert only if the color changes.
-            int key = HashPix(*argb, 32 - cache_bits_max);
-            for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
-              hashers[i].colors_[key] = *argb;
-            }
-            argb_prev = *argb;
-          }
-          argb++;
-        } while (--len != 0);
-      }
-      VP8LRefsCursorNext(&c);
-    }
-  }
-  for (i = 0; i <= cache_bits_max; ++i) {
-    entropies[i] = VP8LHistogramEstimateBits(histos[i]);
-  }
-  ok = 1;
-Error:
-  for (i = 0; i <= cache_bits_max; ++i) {
-    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
-    VP8LFreeHistogram(histos[i]);
-  }
-  return ok;
-}
-
-// Evaluate optimal cache bits for the local color cache.
-// The input *best_cache_bits sets the maximum cache bits to use (passing 0
-// implies disabling the local color cache). The local color cache is also
-// disabled for the lower (<= 25) quality.
-// Returns 0 in case of memory error.
-static int CalculateBestCacheSize(const uint32_t* const argb,
-                                  int xsize, int ysize, int quality,
-                                  const VP8LHashChain* const hash_chain,
-                                  VP8LBackwardRefs* const refs,
-                                  int* const lz77_computed,
-                                  int* const best_cache_bits) {
-  int i;
-  int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
-  double entropy_min = MAX_ENTROPY;
-  double entropies[MAX_COLOR_CACHE_BITS + 1];
-
-  assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
-
-  *lz77_computed = 0;
-  if (cache_bits_high == 0) {
-    *best_cache_bits = 0;
-    // Local color cache is disabled.
-    return 1;
-  }
-  // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache
-  // is not that different in practice.
-  if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) {
-    return 0;
-  }
-  // Find the cache_bits giving the lowest entropy. The search is done in a
-  // brute-force way as the function (entropy w.r.t cache_bits) can be
-  // anything in practice.
-  if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) {
-    return 0;
-  }
-  for (i = 0; i <= cache_bits_high; ++i) {
-    if (i == 0 || entropies[i] < entropy_min) {
-      entropy_min = entropies[i];
-      *best_cache_bits = i;
-    }
-  }
-  return 1;
-}
-
-// Update (in-place) backward references for specified cache_bits.
-static int BackwardRefsWithLocalCache(const uint32_t* const argb,
-                                      int cache_bits,
-                                      VP8LBackwardRefs* const refs) {
-  int pixel_index = 0;
-  VP8LColorCache hashers;
-  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
-
-  while (VP8LRefsCursorOk(&c)) {
-    PixOrCopy* const v = c.cur_pos;
-    if (PixOrCopyIsLiteral(v)) {
-      const uint32_t argb_literal = v->argb_or_distance;
-      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
-      if (ix >= 0) {
-        // hashers contains argb_literal
-        *v = PixOrCopyCreateCacheIdx(ix);
-      } else {
-        VP8LColorCacheInsert(&hashers, argb_literal);
-      }
-      ++pixel_index;
-    } else {
-      // refs was created without local cache, so it can not have cache indexes.
-      int k;
-      assert(PixOrCopyIsCopy(v));
-      for (k = 0; k < v->len; ++k) {
-        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
-      }
-    }
-    VP8LRefsCursorNext(&c);
-  }
-  VP8LColorCacheClear(&hashers);
-  return 1;
-}
-
-static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
-    int width, int height, const uint32_t* const argb,
-    int* const cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
-  *cache_bits = 0;
-  if (!BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
-    return NULL;
-  }
-  BackwardReferences2DLocality(width, refs_lz77);
-  return refs_lz77;
-}
-
-static VP8LBackwardRefs* GetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int* const cache_bits, const VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  int lz77_is_useful;
-  int lz77_computed;
-  double bit_cost_lz77, bit_cost_rle;
-  VP8LBackwardRefs* best = NULL;
-  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
-  VP8LBackwardRefs* refs_rle = &refs_array[1];
-  VP8LHistogram* histo = NULL;
-
-  if (!CalculateBestCacheSize(argb, width, height, quality, hash_chain,
-                              refs_lz77, &lz77_computed, cache_bits)) {
-    goto Error;
-  }
-
-  if (lz77_computed) {
-    // Transform refs_lz77 for the optimized cache_bits.
-    if (*cache_bits > 0) {
-      if (!BackwardRefsWithLocalCache(argb, *cache_bits, refs_lz77)) {
-        goto Error;
-      }
-    }
-  } else {
-    if (!BackwardReferencesLz77(width, height, argb, *cache_bits, hash_chain,
-                                refs_lz77)) {
-      goto Error;
-    }
-  }
-
-  if (!BackwardReferencesRle(width, height, argb, *cache_bits, refs_rle)) {
-    goto Error;
-  }
-
-  histo = VP8LAllocateHistogram(*cache_bits);
-  if (histo == NULL) goto Error;
-
-  {
-    // Evaluate LZ77 coding.
-    VP8LHistogramCreate(histo, refs_lz77, *cache_bits);
-    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
-    // Evaluate RLE coding.
-    VP8LHistogramCreate(histo, refs_rle, *cache_bits);
-    bit_cost_rle = VP8LHistogramEstimateBits(histo);
-    // Decide if LZ77 is useful.
-    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
-  }
-
-  // Choose appropriate backward reference.
-  if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
-    best = refs_lz77;   // default guess: lz77 is better
-    if (try_lz77_trace_backwards) {
-      VP8LBackwardRefs* const refs_trace = refs_rle;
-      if (!VP8LBackwardRefsCopy(refs_lz77, refs_trace)) {
-        best = NULL;
-        goto Error;
-      }
-      if (BackwardReferencesTraceBackwards(width, height, argb, quality,
-                                           *cache_bits, hash_chain,
-                                           refs_trace)) {
-        double bit_cost_trace;
-        // Evaluate LZ77 coding.
-        VP8LHistogramCreate(histo, refs_trace, *cache_bits);
-        bit_cost_trace = VP8LHistogramEstimateBits(histo);
-        if (bit_cost_trace < bit_cost_lz77) {
-          best = refs_trace;
-        }
-      }
-    }
-  } else {
-    best = refs_rle;
-  }
-
-  BackwardReferences2DLocality(width, best);
-
- Error:
-  VP8LFreeHistogram(histo);
-  return best;
-}
-
-VP8LBackwardRefs* VP8LGetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[2]) {
-  if (low_effort) {
-    return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
-                                          hash_chain, refs_array);
-  } else {
-    return GetBackwardReferences(width, height, argb, quality, cache_bits,
-                                 hash_chain, refs_array);
-  }
-}
diff --git a/thirdparty/libwebp/dec/alpha_dec.c b/thirdparty/libwebp/src/dec/alpha_dec.c
index 83ffd4b609..bce735bfc2 100644
--- a/thirdparty/libwebp/dec/alpha_dec.c
+++ b/thirdparty/libwebp/src/dec/alpha_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../utils/quant_levels_dec_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/quant_levels_dec_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 //------------------------------------------------------------------------------
 // ALPHDecoder object.
diff --git a/thirdparty/libwebp/dec/alphai_dec.h b/thirdparty/libwebp/src/dec/alphai_dec.h
index 561e8151ee..e0fa281a55 100644
--- a/thirdparty/libwebp/dec/alphai_dec.h
+++ b/thirdparty/libwebp/src/dec/alphai_dec.h
@@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#ifndef WEBP_DEC_ALPHAI_H_
-#define WEBP_DEC_ALPHAI_H_
+#ifndef WEBP_DEC_ALPHAI_DEC_H_
+#define WEBP_DEC_ALPHAI_DEC_H_
 
-#include "./webpi_dec.h"
-#include "../utils/filters_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/filters_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,4 +51,4 @@ void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_ALPHAI_H_ */
+#endif  /* WEBP_DEC_ALPHAI_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/buffer_dec.c b/thirdparty/libwebp/src/dec/buffer_dec.c
index c685fd5646..75eb3c40b4 100644
--- a/thirdparty/libwebp/dec/buffer_dec.c
+++ b/thirdparty/libwebp/src/dec/buffer_dec.c
@@ -13,15 +13,15 @@
 
 #include <stdlib.h>
 
-#include "./vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // WebPDecBuffer
 
 // Number of bytes per pixel for the different color-spaces.
-static const int kModeBpp[MODE_LAST] = {
+static const uint8_t kModeBpp[MODE_LAST] = {
   3, 4, 3, 4, 4, 2, 2,
   4, 4, 4, 2,    // pre-multiplied modes
   1, 1 };
@@ -36,7 +36,7 @@ static int IsValidColorspace(int webp_csp_mode) {
 // strictly speaking, the very last (or first, if flipped) row
 // doesn't require padding.
 #define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE)       \
-    (uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH)
+    ((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
 
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
   int ok = 1;
@@ -98,9 +98,14 @@ static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
     uint64_t uv_size = 0, a_size = 0, total_size;
     // We need memory and it hasn't been allocated yet.
     // => initialize output buffer, now that dimensions are known.
-    const int stride = w * kModeBpp[mode];
-    const uint64_t size = (uint64_t)stride * h;
+    int stride;
+    uint64_t size;
 
+    if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
+      return VP8_STATUS_INVALID_PARAM;
+    }
+    stride = w * kModeBpp[mode];
+    size = (uint64_t)stride * h;
     if (!WebPIsRGBMode(mode)) {
       uv_stride = (w + 1) / 2;
       uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
@@ -169,11 +174,11 @@ VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
   return VP8_STATUS_OK;
 }
 
-VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
                                     const WebPDecoderOptions* const options,
-                                    WebPDecBuffer* const out) {
+                                    WebPDecBuffer* const buffer) {
   VP8StatusCode status;
-  if (out == NULL || w <= 0 || h <= 0) {
+  if (buffer == NULL || width <= 0 || height <= 0) {
     return VP8_STATUS_INVALID_PARAM;
   }
   if (options != NULL) {    // First, apply options if there is any.
@@ -182,33 +187,39 @@ VP8StatusCode WebPAllocateDecBuffer(int w, int h,
       const int ch = options->crop_height;
       const int x = options->crop_left & ~1;
       const int y = options->crop_top & ~1;
-      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
+          x + cw > width || y + ch > height) {
         return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
       }
-      w = cw;
-      h = ch;
+      width = cw;
+      height = ch;
     }
+
     if (options->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
       int scaled_width = options->scaled_width;
       int scaled_height = options->scaled_height;
       if (!WebPRescalerGetScaledDimensions(
-              w, h, &scaled_width, &scaled_height)) {
+              width, height, &scaled_width, &scaled_height)) {
         return VP8_STATUS_INVALID_PARAM;
       }
-      w = scaled_width;
-      h = scaled_height;
+      width = scaled_width;
+      height = scaled_height;
+#else
+      return VP8_STATUS_INVALID_PARAM;   // rescaling not supported
+#endif
     }
   }
-  out->width = w;
-  out->height = h;
+  buffer->width = width;
+  buffer->height = height;
 
   // Then, allocate buffer for real.
-  status = AllocateBuffer(out);
+  status = AllocateBuffer(buffer);
   if (status != VP8_STATUS_OK) return status;
 
   // Use the stride trick if vertical flip is needed.
   if (options != NULL && options->flip) {
-    status = WebPFlipBuffer(out);
+    status = WebPFlipBuffer(buffer);
   }
   return status;
 }
diff --git a/thirdparty/libwebp/dec/common_dec.h b/thirdparty/libwebp/src/dec/common_dec.h
index 6961e22470..9995f1a51a 100644
--- a/thirdparty/libwebp/dec/common_dec.h
+++ b/thirdparty/libwebp/src/dec/common_dec.h
@@ -11,8 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_DEC_COMMON_H_
-#define WEBP_DEC_COMMON_H_
+#ifndef WEBP_DEC_COMMON_DEC_H_
+#define WEBP_DEC_COMMON_DEC_H_
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -51,4 +51,4 @@ enum { MB_FEATURE_TREE_PROBS = 3,
        NUM_PROBAS = 11
      };
 
-#endif    // WEBP_DEC_COMMON_H_
+#endif    // WEBP_DEC_COMMON_DEC_H_
diff --git a/thirdparty/libwebp/dec/frame_dec.c b/thirdparty/libwebp/src/dec/frame_dec.c
index f91e27f7c8..517d0f5850 100644
--- a/thirdparty/libwebp/dec/frame_dec.c
+++ b/thirdparty/libwebp/src/dec/frame_dec.c
@@ -12,13 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Main reconstruction function.
 
-static const int kScan[16] = {
+static const uint16_t kScan[16] = {
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
@@ -320,7 +320,7 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 #define MIN_DITHER_AMP 4
 
 #define DITHER_AMP_TAB_SIZE 12
-static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
   // roughly, it's dqm->uv_mat_[1]
   8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
@@ -728,7 +728,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   }
 
   mem = (uint8_t*)dec->mem_;
-  dec->intra_t_ = (uint8_t*)mem;
+  dec->intra_t_ = mem;
   mem += intra_pred_mode_size;
 
   dec->yuv_t_ = (VP8TopSamples*)mem;
@@ -750,7 +750,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
 
   mem = (uint8_t*)WEBP_ALIGN(mem);
   assert((yuv_size & WEBP_ALIGN_CST) == 0);
-  dec->yuv_b_ = (uint8_t*)mem;
+  dec->yuv_b_ = mem;
   mem += yuv_size;
 
   dec->mb_data_ = (VP8MBData*)mem;
@@ -766,7 +766,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
     const int extra_rows = kFilterExtraRows[dec->filter_type_];
     const int extra_y = extra_rows * dec->cache_y_stride_;
     const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
-    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_y_ = mem + extra_y;
     dec->cache_u_ = dec->cache_y_
                   + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
     dec->cache_v_ = dec->cache_u_
@@ -776,7 +776,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += cache_size;
 
   // alpha plane
-  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  dec->alpha_plane_ = alpha_size ? mem : NULL;
   mem += alpha_size;
   assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
 
diff --git a/thirdparty/libwebp/dec/idec_dec.c b/thirdparty/libwebp/src/dec/idec_dec.c
index 78fb2e7186..a371ed7500 100644
--- a/thirdparty/libwebp/dec/idec_dec.c
+++ b/thirdparty/libwebp/src/dec/idec_dec.c
@@ -15,10 +15,10 @@
 #include <string.h>
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./webpi_dec.h"
-#include "./vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
@@ -673,12 +673,12 @@ void WebPIDelete(WebPIDecoder* idec) {
 //------------------------------------------------------------------------------
 // Wrapper toward WebPINewDecoder
 
-WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
+WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
                           size_t output_buffer_size, int output_stride) {
   const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
   WebPIDecoder* idec;
 
-  if (mode >= MODE_YUV) return NULL;
+  if (csp >= MODE_YUV) return NULL;
   if (is_external_memory == 0) {    // Overwrite parameters to sane values.
     output_buffer_size = 0;
     output_stride = 0;
@@ -689,7 +689,7 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer,
   }
   idec = WebPINewDecoder(NULL);
   if (idec == NULL) return NULL;
-  idec->output_.colorspace = mode;
+  idec->output_.colorspace = csp;
   idec->output_.is_external_memory = is_external_memory;
   idec->output_.u.RGBA.rgba = output_buffer;
   idec->output_.u.RGBA.stride = output_stride;
diff --git a/thirdparty/libwebp/dec/io_dec.c b/thirdparty/libwebp/src/dec/io_dec.c
index 8bfab86959..e603f19c98 100644
--- a/thirdparty/libwebp/dec/io_dec.c
+++ b/thirdparty/libwebp/src/dec/io_dec.c
@@ -13,11 +13,11 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "../dec/vp8i_dec.h"
-#include "./webpi_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/yuv.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
@@ -212,7 +212,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
     int num_rows;
     const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
     uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     uint8_t* alpha_dst = base_rgba;
 #else
     uint8_t* alpha_dst = base_rgba + 1;
@@ -241,6 +241,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
 //------------------------------------------------------------------------------
 // YUV rescaling (no final RGB conversion needed)
 
+#if !defined(WEBP_REDUCE_SIZE)
 static int Rescale(const uint8_t* src, int src_stride,
                    int new_lines, WebPRescaler* const wrk) {
   int num_lines_out = 0;
@@ -431,7 +432,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                                int max_lines_out) {
   const WebPRGBABuffer* const buf = &p->output->u.RGBA;
   uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   uint8_t* alpha_dst = base_rgba;
 #else
   uint8_t* alpha_dst = base_rgba + 1;
@@ -541,6 +542,8 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
   return 1;
 }
 
+#endif  // WEBP_REDUCE_SIZE
+
 //------------------------------------------------------------------------------
 // Default custom functions
 
@@ -561,10 +564,14 @@ static int CustomSetup(VP8Io* io) {
     WebPInitUpsamplers();
   }
   if (io->use_scaling) {
+#if !defined(WEBP_REDUCE_SIZE)
     const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
     if (!ok) {
       return 0;    // memory error
     }
+#else
+    return 0;   // rescaling support not compiled
+#endif
   } else {
     if (is_rgb) {
       WebPInitSamplers();
@@ -598,9 +605,6 @@ static int CustomSetup(VP8Io* io) {
     }
   }
 
-  if (is_rgb) {
-    VP8YUVInit();
-  }
   return 1;
 }
 
diff --git a/thirdparty/libwebp/dec/quant_dec.c b/thirdparty/libwebp/src/dec/quant_dec.c
index 14e3198946..f07212ad73 100644
--- a/thirdparty/libwebp/dec/quant_dec.c
+++ b/thirdparty/libwebp/src/dec/quant_dec.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_dec.h"
+#include "src/dec/vp8i_dec.h"
 
 static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
diff --git a/thirdparty/libwebp/dec/tree_dec.c b/thirdparty/libwebp/src/dec/tree_dec.c
index 9e805f60f3..3f5a957d32 100644
--- a/thirdparty/libwebp/dec/tree_dec.c
+++ b/thirdparty/libwebp/src/dec/tree_dec.c
@@ -11,15 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"
 
+#if !defined(USE_GENERIC_TREE)
 #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
 // using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
-#define USE_GENERIC_TREE
+#define USE_GENERIC_TREE 1   // ALTERNATE_CODE
+#else
+#define USE_GENERIC_TREE 0
 #endif
+#endif  // USE_GENERIC_TREE
 
-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
 static const int8_t kYModesIntra4[18] = {
   -B_DC_PRED, 1,
     -B_TM_PRED, 2,
@@ -317,7 +321,7 @@ static void ParseIntraMode(VP8BitReader* const br,
       int x;
       for (x = 0; x < 4; ++x) {
         const uint8_t* const prob = kBModesProba[top[x]][ymode];
-#ifdef USE_GENERIC_TREE
+#if (USE_GENERIC_TREE == 1)
         // Generic tree-parsing
         int i = kYModesIntra4[VP8GetBit(br, prob[0])];
         while (i > 0) {
@@ -335,7 +339,7 @@ static void ParseIntraMode(VP8BitReader* const br,
                         (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
                           (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
                             (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
-#endif    // USE_GENERIC_TREE
+#endif  // USE_GENERIC_TREE
         top[x] = ymode;
       }
       memcpy(modes, top, 4 * sizeof(*top));
@@ -498,7 +502,7 @@ static const uint8_t
 
 // Paragraph 9.9
 
-static const int kBands[16 + 1] = {
+static const uint8_t kBands[16 + 1] = {
   0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
   0  // extra entry as sentinel
 };
diff --git a/thirdparty/libwebp/dec/vp8_dec.c b/thirdparty/libwebp/src/dec/vp8_dec.c
index fad8d9cf35..6212efd179 100644
--- a/thirdparty/libwebp/dec/vp8_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8_dec.c
@@ -13,12 +13,12 @@
 
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_inl_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/dec/vp8_dec.h b/thirdparty/libwebp/src/dec/vp8_dec.h
index b9337bbec0..ca85b340cf 100644
--- a/thirdparty/libwebp/dec/vp8_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8_dec.h
@@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_WEBP_DECODE_VP8_H_
-#define WEBP_WEBP_DECODE_VP8_H_
+#ifndef WEBP_DEC_VP8_DEC_H_
+#define WEBP_DEC_VP8_DEC_H_
 
-#include "../webp/decode.h"
+#include "src/webp/decode.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,7 +33,7 @@ extern "C" {
 //   /* customize io's functions (setup()/put()/teardown()) if needed. */
 //
 //   VP8Decoder* dec = VP8New();
-//   bool ok = VP8Decode(dec);
+//   int ok = VP8Decode(dec, &io);
 //   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
 //   VP8Delete(dec);
 //   return ok;
@@ -157,24 +157,24 @@ void VP8Delete(VP8Decoder* const dec);
 // Miscellaneous VP8/VP8L bitstream probing functions.
 
 // Returns true if the next 3 bytes in data contain the VP8 signature.
-WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
 
 // Validates the VP8 data-header and retrieves basic header information viz
 // width and height. Returns 0 in case of formatting error. *width/*height
 // can be passed NULL.
-WEBP_EXTERN(int) VP8GetInfo(
+WEBP_EXTERN int VP8GetInfo(
     const uint8_t* data,
     size_t data_size,    // data available so far
     size_t chunk_size,   // total data size expected in the chunk
     int* const width, int* const height);
 
 // Returns true if the next byte(s) in data is a VP8L signature.
-WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
 
 // Validates the VP8L data-header and retrieves basic header information viz
 // width, height and alpha. Returns 0 in case of formatting error.
 // width/height/has_alpha can be passed NULL.
-WEBP_EXTERN(int) VP8LGetInfo(
+WEBP_EXTERN int VP8LGetInfo(
     const uint8_t* data, size_t data_size,  // data available so far
     int* const width, int* const height, int* const has_alpha);
 
@@ -182,4 +182,4 @@ WEBP_EXTERN(int) VP8LGetInfo(
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
+#endif  /* WEBP_DEC_VP8_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/vp8i_dec.h b/thirdparty/libwebp/src/dec/vp8i_dec.h
index 555853e8f8..28244d9d7a 100644
--- a/thirdparty/libwebp/dec/vp8i_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8i_dec.h
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_DEC_VP8I_H_
-#define WEBP_DEC_VP8I_H_
+#ifndef WEBP_DEC_VP8I_DEC_H_
+#define WEBP_DEC_VP8I_DEC_H_
 
 #include <string.h>     // for memcpy()
-#include "./common_dec.h"
-#include "./vp8li_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/random_utils.h"
-#include "../utils/thread_utils.h"
-#include "../dsp/dsp.h"
+#include "src/dec/common_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/random_utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/dsp/dsp.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 6
-#define DEC_REV_VERSION 0
+#define DEC_REV_VERSION 1
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
@@ -57,7 +57,6 @@ extern "C" {
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
 #define YUV_SIZE (BPS * 17 + BPS * 9)
-#define Y_SIZE   (BPS * 17)
 #define Y_OFF    (BPS * 1 + 8)
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
@@ -317,4 +316,4 @@ const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_VP8I_H_ */
+#endif  /* WEBP_DEC_VP8I_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/vp8l_dec.c b/thirdparty/libwebp/src/dec/vp8l_dec.c
index ef359a91f0..42ea3b5e4c 100644
--- a/thirdparty/libwebp/dec/vp8l_dec.c
+++ b/thirdparty/libwebp/src/dec/vp8l_dec.c
@@ -14,22 +14,22 @@
 
 #include <stdlib.h>
 
-#include "./alphai_dec.h"
-#include "./vp8li_dec.h"
-#include "../dsp/dsp.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../dsp/yuv.h"
-#include "../utils/endian_inl_utils.h"
-#include "../utils/huffman_utils.h"
-#include "../utils/utils.h"
+#include "src/dec/alphai_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/huffman_utils.h"
+#include "src/utils/utils.h"
 
 #define NUM_ARGB_CACHE_ROWS          16
 
 static const int kCodeLengthLiterals = 16;
 static const int kCodeLengthRepeatCode = 16;
-static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
-static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+static const uint8_t kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const uint8_t kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
 
 // -----------------------------------------------------------------------------
 //  Five Huffman codes are used at each meta code:
@@ -86,7 +86,7 @@ static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
 // All values computed for 8-bit first level lookup with Mark Adler's tool:
 // http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
 #define FIXED_TABLE_SIZE (630 * 3 + 410)
-static const int kTableSize[12] = {
+static const uint16_t kTableSize[12] = {
   FIXED_TABLE_SIZE + 654,
   FIXED_TABLE_SIZE + 656,
   FIXED_TABLE_SIZE + 658,
@@ -485,6 +485,7 @@ static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
 //------------------------------------------------------------------------------
 // Scaling.
 
+#if !defined(WEBP_REDUCE_SIZE)
 static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
   const int num_channels = 4;
   const int in_width = io->mb_w;
@@ -516,10 +517,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
                    out_width, out_height, 0, num_channels, work);
   return 1;
 }
+#endif   // WEBP_REDUCE_SIZE
 
 //------------------------------------------------------------------------------
 // Export to ARGB
 
+#if !defined(WEBP_REDUCE_SIZE)
+
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
@@ -561,6 +565,8 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
   return num_lines_out;
 }
 
+#endif   // WEBP_REDUCE_SIZE
+
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
                     const uint8_t* row_in, int in_stride,
@@ -746,9 +752,12 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
       if (WebPIsRGBMode(output->colorspace)) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
         uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
-        const int num_rows_out = io->use_scaling ?
+        const int num_rows_out =
+#if !defined(WEBP_REDUCE_SIZE)
+         io->use_scaling ?
             EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
                                  rgba, buf->stride) :
+#endif  // WEBP_REDUCE_SIZE
             EmitRows(output->colorspace, rows_data, in_stride,
                      io->mb_w, io->mb_h, rgba, buf->stride);
         // Update 'last_out_row_'.
@@ -1012,12 +1021,13 @@ static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
       ok = 0;
       goto End;
     }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
+    br->eos_ = VP8LIsEndOfStream(br);
   }
   // Process the remaining rows corresponding to last row-block.
   ExtractPalettedAlphaRows(dec, row > last_row ? last_row : row);
 
  End:
+  br->eos_ = VP8LIsEndOfStream(br);
   if (!ok || (br->eos_ && pos < end)) {
     ok = 0;
     dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
@@ -1090,11 +1100,12 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
     VP8LFillBitWindow(br);
     if (htree_group->use_packed_table) {
       code = ReadPackedSymbols(htree_group, br, src);
+      if (VP8LIsEndOfStream(br)) break;
       if (code == PACKED_NON_LITERAL_CODE) goto AdvanceByOne;
     } else {
       code = ReadSymbol(htree_group->htrees[GREEN], br);
     }
-    if (br->eos_) break;  // early out
+    if (VP8LIsEndOfStream(br)) break;
     if (code < NUM_LITERAL_CODES) {  // Literal
       if (htree_group->is_trivial_literal) {
         *src = htree_group->literal_arb | (code << 8);
@@ -1104,7 +1115,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
         VP8LFillBitWindow(br);
         blue = ReadSymbol(htree_group->htrees[BLUE], br);
         alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
-        if (br->eos_) break;
+        if (VP8LIsEndOfStream(br)) break;
         *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
       }
     AdvanceByOne:
@@ -1132,7 +1143,7 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
       VP8LFillBitWindow(br);
       dist_code = GetCopyDistance(dist_symbol, br);
       dist = PlaneCodeToDistance(width, dist_code);
-      if (br->eos_) break;
+      if (VP8LIsEndOfStream(br)) break;
       if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
         goto Error;
       } else {
@@ -1169,9 +1180,9 @@ static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
     } else {  // Not reached
       goto Error;
     }
-    assert(br->eos_ == VP8LIsEndOfStream(br));
   }
 
+  br->eos_ = VP8LIsEndOfStream(br);
   if (dec->incremental_ && br->eos_ && src < src_end) {
     RestoreState(dec);
   } else if (!br->eos_) {
@@ -1630,12 +1641,19 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
 
     if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
 
+#if !defined(WEBP_REDUCE_SIZE)
     if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 
     if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
       // need the alpha-multiply functions for premultiplied output or rescaling
       WebPInitAlphaProcessing();
     }
+#else
+    if (io->use_scaling) {
+      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      goto Err;
+    }
+#endif
     if (!WebPIsRGBMode(dec->output_->colorspace)) {
       WebPInitConvertARGBToYUV();
       if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
diff --git a/thirdparty/libwebp/dec/vp8li_dec.h b/thirdparty/libwebp/src/dec/vp8li_dec.h
index 097a9d0589..8e500cf9ff 100644
--- a/thirdparty/libwebp/dec/vp8li_dec.h
+++ b/thirdparty/libwebp/src/dec/vp8li_dec.h
@@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora(vikaas.arora@gmail.com)
 
-#ifndef WEBP_DEC_VP8LI_H_
-#define WEBP_DEC_VP8LI_H_
+#ifndef WEBP_DEC_VP8LI_DEC_H_
+#define WEBP_DEC_VP8LI_DEC_H_
 
 #include <string.h>     // for memcpy()
-#include "./webpi_dec.h"
-#include "../utils/bit_reader_utils.h"
-#include "../utils/color_cache_utils.h"
-#include "../utils/huffman_utils.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/huffman_utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -132,4 +132,4 @@ void VP8LDelete(VP8LDecoder* const dec);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_VP8LI_H_ */
+#endif  /* WEBP_DEC_VP8LI_DEC_H_ */
diff --git a/thirdparty/libwebp/dec/webp_dec.c b/thirdparty/libwebp/src/dec/webp_dec.c
index a8e9c2c510..42d098874d 100644
--- a/thirdparty/libwebp/dec/webp_dec.c
+++ b/thirdparty/libwebp/src/dec/webp_dec.c
@@ -13,11 +13,11 @@
 
 #include <stdlib.h>
 
-#include "./vp8i_dec.h"
-#include "./vp8li_dec.h"
-#include "./webpi_dec.h"
-#include "../utils/utils.h"
-#include "../webp/mux_types.h"  // ALPHA_FLAG
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/dec/webpi_dec.h"
+#include "src/utils/utils.h"
+#include "src/webp/mux_types.h"  // ALPHA_FLAG
 
 //------------------------------------------------------------------------------
 // RIFF layout is:
@@ -421,7 +421,9 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
                                 NULL, NULL, NULL, &has_animation,
                                 NULL, headers);
   if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
-    // TODO(jzern): full support of animation frames will require API additions.
+    // The WebPDemux API + libwebp can be used to decode individual
+    // uncomposited frames or the WebPAnimDecoder can be used to fully
+    // reconstruct them (see webp/demux.h).
     if (has_animation) {
       status = VP8_STATUS_UNSUPPORTED_FEATURE;
     }
diff --git a/thirdparty/libwebp/dec/webpi_dec.h b/thirdparty/libwebp/src/dec/webpi_dec.h
index 696abc1958..c378ba6fc3 100644
--- a/thirdparty/libwebp/dec/webpi_dec.h
+++ b/thirdparty/libwebp/src/dec/webpi_dec.h
@@ -11,15 +11,15 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#ifndef WEBP_DEC_WEBPI_H_
-#define WEBP_DEC_WEBPI_H_
+#ifndef WEBP_DEC_WEBPI_DEC_H_
+#define WEBP_DEC_WEBPI_DEC_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "../utils/rescaler_utils.h"
-#include "./vp8_dec.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dec/vp8_dec.h"
 
 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
@@ -130,4 +130,4 @@ int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_DEC_WEBPI_H_ */
+#endif  /* WEBP_DEC_WEBPI_DEC_H_ */
diff --git a/thirdparty/libwebp/demux/anim_decode.c b/thirdparty/libwebp/src/demux/anim_decode.c
index f1cf176e72..05dd707371 100644
--- a/thirdparty/libwebp/demux/anim_decode.c
+++ b/thirdparty/libwebp/src/demux/anim_decode.c
@@ -11,15 +11,15 @@
 //
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <string.h>
 
-#include "../utils/utils.h"
-#include "../webp/decode.h"
-#include "../webp/demux.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/demux.h"
 
 #define NUM_CHANNELS 4
 
diff --git a/thirdparty/libwebp/demux/demux.c b/thirdparty/libwebp/src/demux/demux.c
index 100eab8c01..79c24a5a7f 100644
--- a/thirdparty/libwebp/demux/demux.c
+++ b/thirdparty/libwebp/src/demux/demux.c
@@ -11,21 +11,21 @@
 //
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "../utils/utils.h"
-#include "../webp/decode.h"     // WebPGetFeatures
-#include "../webp/demux.h"
-#include "../webp/format_constants.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"     // WebPGetFeatures
+#include "src/webp/demux.h"
+#include "src/webp/format_constants.h"
 
 #define DMUX_MAJ_VERSION 0
 #define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 2
+#define DMUX_REV_VERSION 3
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -205,12 +205,14 @@ static void SetFrameInfo(size_t start_offset, size_t size,
   frame->complete_ = complete;
 }
 
-// Store image bearing chunks to 'frame'.
+// Store image-bearing chunks to 'frame'. 'min_size' is an optional minimum
+// size requirement; it may be zero.
 static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
                               MemBuffer* const mem, Frame* const frame) {
   int alpha_chunks = 0;
   int image_chunks = 0;
-  int done = (MemDataSize(mem) < min_size);
+  int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
+              MemDataSize(mem) < min_size);
   ParseStatus status = PARSE_OK;
 
   if (done) return PARSE_NEED_MORE_DATA;
@@ -401,9 +403,9 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
   frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
   if (frame == NULL) return PARSE_ERROR;
 
-  // For the single image case we allow parsing of a partial frame, but we need
-  // at least CHUNK_HEADER_SIZE for parsing.
-  status = StoreFrame(1, CHUNK_HEADER_SIZE, &dmux->mem_, frame);
+  // For the single image case we allow parsing of a partial frame, so no
+  // minimum size is imposed here.
+  status = StoreFrame(1, 0, &dmux->mem_, frame);
   if (status != PARSE_ERROR) {
     const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
     // Clear any alpha when the alpha flag is missing.
diff --git a/thirdparty/libwebp/dsp/alpha_processing.c b/thirdparty/libwebp/src/dsp/alpha_processing.c
index 4b60e092be..590e3bc312 100644
--- a/thirdparty/libwebp/dsp/alpha_processing.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing.c
@@ -12,10 +12,13 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 // Tables can be faster on some platform but incur some extra binary size (~2k).
-// #define USE_TABLES_FOR_ALPHA_MULT
+#if !defined(USE_TABLES_FOR_ALPHA_MULT)
+#define USE_TABLES_FOR_ALPHA_MULT 0   // ALTERNATE_CODE
+#endif
+
 
 // -----------------------------------------------------------------------------
 
@@ -29,7 +32,7 @@ static uint32_t Mult(uint8_t x, uint32_t mult) {
   return v;
 }
 
-#ifdef USE_TABLES_FOR_ALPHA_MULT
+#if (USE_TABLES_FOR_ALPHA_MULT == 1)
 
 static const uint32_t kMultTables[2][256] = {
   {    // (255u << MFIX) / alpha
@@ -132,9 +135,9 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
   return inverse ? (255u << MFIX) / a : a * KINV_255;
 }
 
-#endif    // USE_TABLES_FOR_ALPHA_MULT
+#endif  // USE_TABLES_FOR_ALPHA_MULT
 
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t argb = ptr[x];
@@ -154,8 +157,8 @@ void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse) {
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse) {
   int x;
   for (x = 0; x < width; ++x) {
     const uint32_t a = alpha[x];
@@ -217,8 +220,9 @@ void WebPMultRows(uint8_t* ptr, int stride,
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif
 
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
-                               int w, int h, int stride) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
+                                 int w, int h, int stride) {
   while (h-- > 0) {
     uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
     const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
@@ -235,6 +239,7 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
     rgba += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
@@ -254,9 +259,9 @@ static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
   return (x * m) >> 16;
 }
 
-static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
-                                               int w, int h, int stride,
-                                               int rg_byte_pos /* 0 or 1 */) {
+static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
+                                                 int w, int h, int stride,
+                                                 int rg_byte_pos /* 0 or 1 */) {
   while (h-- > 0) {
     int i;
     for (i = 0; i < w; ++i) {
@@ -275,15 +280,16 @@ static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
 }
 #undef MULTIPLIER
 
-static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
-                                   int w, int h, int stride) {
-#ifdef WEBP_SWAP_16BIT_CSP
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
+                                     int w, int h, int stride) {
+#if (WEBP_SWAP_16BIT_CSP == 1)
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
 #else
-  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+  ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
 #endif
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
                            int width, int height,
                            uint8_t* dst, int dst_stride) {
@@ -338,6 +344,36 @@ static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
   int i;
   for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
+
+//------------------------------------------------------------------------------
+
+static int HasAlpha8b_C(const uint8_t* src, int length) {
+  int i;  // report 1 at the first byte that differs from 0xff
+  for (i = 0; i < length; ++i) if (src[i] != 0xff) return 1;
+  return 0;
+}
+
+static int HasAlpha32b_C(const uint8_t* src, int length) {
+  int i;  // probes every fourth byte (src[0], src[4], ...), 'length' times
+  for (i = 0; i < length; ++i) if (src[4 * i] != 0xff) return 1;
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Simple channel manipulations.
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);  // a: bits 24-31
+}
+
+static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                      int len, int step, uint32_t* out) {
+  // Emits 'len' words with alpha 0xff; inputs are read 'step' apart.
+  int n, pos;
+  for (n = 0, pos = 0; n < len; ++n, pos += step) {
+    out[n] = MakeARGB32(0xff, r[pos], g[pos], b[pos]);
+  }
+}
 
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
@@ -345,6 +381,11 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out);
+
+int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+int (*WebPHasAlpha32b)(const uint8_t* src, int length);
 
 //------------------------------------------------------------------------------
 // Init function
@@ -360,15 +401,21 @@ static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
   if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  WebPMultARGBRow = WebPMultARGBRowC;
-  WebPMultRow = WebPMultRowC;
-  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
-  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPMultARGBRow = WebPMultARGBRow_C;
+  WebPMultRow = WebPMultRow_C;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
 
+  WebPPackRGB = PackRGB_C;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
   WebPDispatchAlpha = DispatchAlpha_C;
   WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
   WebPExtractAlpha = ExtractAlpha_C;
   WebPExtractGreen = ExtractGreen_C;
+#endif
+
+  WebPHasAlpha8b = HasAlpha8b_C;
+  WebPHasAlpha32b = HasAlpha32b_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -382,16 +429,31 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitAlphaProcessingNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitAlphaProcessingMIPSdspR2();
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitAlphaProcessingNEON();
+  }
+#endif
+
+  assert(WebPMultARGBRow != NULL);
+  assert(WebPMultRow != NULL);
+  assert(WebPApplyAlphaMultiply != NULL);
+  assert(WebPApplyAlphaMultiply4444 != NULL);
+  assert(WebPDispatchAlpha != NULL);
+  assert(WebPDispatchAlphaToGreen != NULL);
+  assert(WebPExtractAlpha != NULL);
+  assert(WebPExtractGreen != NULL);
+  assert(WebPPackRGB != NULL);
+  assert(WebPHasAlpha8b != NULL);
+  assert(WebPHasAlpha32b != NULL);
+
   alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/alpha_processing_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
index c631d78905..e0dc91bab9 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
+                                   int width, int height,
+                                   uint8_t* dst, int dst_stride) {
   uint32_t alpha_mask = 0xffffffff;
   int i, j, temp0;
 
@@ -79,7 +79,8 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_mask != 0xff);
 }
 
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
+                                  int inverse) {
   int x;
   const uint32_t c_00ffffff = 0x00ffffffu;
   const uint32_t c_ff000000 = 0xff000000u;
@@ -124,14 +125,54 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
   }
 }
 
+static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
+                              const uint8_t* b, int len, int step,
+                              uint32_t* out) {  // inline-asm pixel packer
+  int temp0, temp1, temp2, offset;  // scratch registers, written by the asm
+  const int rest = len & 1;  // 1 when 'len' is odd, 0 otherwise
+  const int a = 0xff;  // constant passed to the asm as %[a]
+  const uint32_t* const loop_end = out + len - rest;  // 'out' value ending "2:"
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"  // loop: 1 store
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"  // odd-'len' tail
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[r])         \n\t"
+    "lbux         %[temp1],    %[offset](%[g])         \n\t"
+    "lbux         %[temp2],    %[offset](%[b])         \n\t"
+    "ins          %[temp0],    %[a],      16,     16   \n\t"
+    "ins          %[temp2],    %[temp1],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp0],  %[temp2]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"  // exit label
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );  // NOTE(review): presumably yields the same pixels as PackRGB_C -- confirm
+}
+
 //------------------------------------------------------------------------------
 // Entry point
 
 extern void WebPInitAlphaProcessingMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPMultARGBRow = MultARGBRow;
+  WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
+  WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+  WebPPackRGB = PackRGB_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/alpha_processing_neon.c b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c
index 606a401cf7..9d55421704 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_neon.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
-#include "./neon.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/dsp/alpha_processing_sse2.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 83dc559fac..76587006a1 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_sse2.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
 
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
-                         int width, int height,
-                         uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
+                              int width, int height,
+                              uint8_t* dst, int dst_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -72,9 +72,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
   return (alpha_and != 0xff);
 }
 
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
-                                 int width, int height,
-                                 uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
+                                      int width, int height,
+                                      uint32_t* dst, int dst_stride) {
   int i, j;
   const __m128i zero = _mm_setzero_si128();
   const int limit = width & ~15;
@@ -98,9 +98,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
   }
 }
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
+                             int width, int height,
+                             uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -210,6 +210,61 @@ static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
 #undef MULTIPLIER
 #undef PREMULTIPLY
 
+//------------------------------------------------------------------------------
+// Alpha detection
+
+static int HasAlpha8b_SSE2(const uint8_t* src, int length) {  // non-0xff byte?
+  const __m128i all_0xff = _mm_set1_epi8(0xff);  // reference: 16 x 0xff
+  int i = 0;  // byte offset into 'src', shared by both loops below
+  for (; i + 16 <= length; i += 16) {  // full 16-byte groups
+    const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
+    const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);  // per-byte equality
+    const int mask = _mm_movemask_epi8(bits);  // one result bit per byte
+    if (mask != 0xffff) return 1;  // at least one of the 16 bytes != 0xff
+  }
+  for (; i < length; ++i) if (src[i] != 0xff) return 1;  // remaining bytes
+  return 0;  // every byte in src[0..length) was 0xff
+}
+
+static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
+  const __m128i alpha_mask = _mm_set1_epi32(0xff);  // low byte of each 32b lane
+  const __m128i all_0xff = _mm_set1_epi8(0xff);  // reference: 16 x 0xff
+  int i = 0;  // byte offset into 'src', shared by the three loops below
+  // We don't know if we can access the last 3 bytes after the last alpha
+  // value 'src[4 * length - 4]' (because we don't know if alpha is the first
+  // or the last byte of the quadruplet). Hence the '-3' protection below.
+  length = length * 4 - 3;   // size in bytes
+  for (; i + 64 <= length; i += 64) {  // 64 bytes = 16 tested values per pass
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
+    const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);  // drop the other 3 bytes
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i b2 = _mm_and_si128(a2, alpha_mask);
+    const __m128i b3 = _mm_and_si128(a3, alpha_mask);
+    const __m128i c0 = _mm_packs_epi32(b0, b1);  // 32b lanes -> 16b lanes
+    const __m128i c1 = _mm_packs_epi32(b2, b3);
+    const __m128i d  = _mm_packus_epi16(c0, c1);  // 16b lanes -> 16 bytes
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);  // per-byte equality
+    const int mask = _mm_movemask_epi8(bits);  // one result bit per byte
+    if (mask != 0xffff) return 1;  // some tested byte != 0xff
+  }
+  for (; i + 32 <= length; i += 32) {  // 32 bytes = 8 tested values per pass
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i +  0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
+    const __m128i b0 = _mm_and_si128(a0, alpha_mask);
+    const __m128i b1 = _mm_and_si128(a1, alpha_mask);
+    const __m128i c  = _mm_packs_epi32(b0, b1);
+    const __m128i d  = _mm_packus_epi16(c, c);  // same 8 values in both halves
+    const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
+    const int mask = _mm_movemask_epi8(bits);
+    if (mask != 0xffff) return 1;  // some tested byte != 0xff
+  }
+  for (; i <= length; i += 4) if (src[i] != 0xff) return 1;  // scalar remainder
+  return 0;  // every tested byte (one per 4-byte group) was 0xff
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows
 
@@ -238,7 +293,7 @@ static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
     }
   }
   width -= x;
-  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+  if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
 }
 
 static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
@@ -261,7 +316,7 @@ static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
     }
   }
   width -= x;
-  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+  if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
 }
 
 //------------------------------------------------------------------------------
@@ -273,9 +328,12 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
   WebPMultARGBRow = MultARGBRow_SSE2;
   WebPMultRow = MultRow_SSE2;
   WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
-  WebPDispatchAlpha = DispatchAlpha;
-  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
-  WebPExtractAlpha = ExtractAlpha;
+  WebPDispatchAlpha = DispatchAlpha_SSE2;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
+  WebPExtractAlpha = ExtractAlpha_SSE2;
+
+  WebPHasAlpha8b = HasAlpha8b_SSE2;
+  WebPHasAlpha32b = HasAlpha32b_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/alpha_processing_sse41.c b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c
index 986fde94ed..56040f9c88 100644
--- a/thirdparty/libwebp/dsp/alpha_processing_sse41.c
+++ b/thirdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 
@@ -19,9 +19,9 @@
 
 //------------------------------------------------------------------------------
 
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
-                        int width, int height,
-                        uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
+                              int width, int height,
+                              uint8_t* alpha, int alpha_stride) {
   // alpha_and stores an 'and' operation of all the alpha[] values. The final
   // value is not 0xff if any of the alpha[] is not equal to 0xff.
   uint32_t alpha_and = 0xff;
@@ -82,7 +82,7 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
 extern void WebPInitAlphaProcessingSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
-  WebPExtractAlpha = ExtractAlpha;
+  WebPExtractAlpha = ExtractAlpha_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/common_sse2.h b/thirdparty/libwebp/src/dsp/common_sse2.h
index 995d7cf4ea..995d7cf4ea 100644
--- a/thirdparty/libwebp/dsp/common_sse2.h
+++ b/thirdparty/libwebp/src/dsp/common_sse2.h
diff --git a/thirdparty/libwebp/dsp/cost.c b/thirdparty/libwebp/src/dsp/cost.c
index 58ddea7248..a732389d58 100644
--- a/thirdparty/libwebp/dsp/cost.c
+++ b/thirdparty/libwebp/src/dsp/cost.c
@@ -9,8 +9,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
@@ -319,7 +319,7 @@ const uint8_t VP8EncBands[16 + 1] = {
 //------------------------------------------------------------------------------
 // Mode costs
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
   int n = res->first;
   // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
   const int p0 = res->prob[n][ctx0][0];
@@ -354,8 +354,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   return cost;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_C(const int16_t* const coeffs,
+                                VP8Residual* const res) {
   int n;
   res->last = -1;
   assert(res->first == 0 || coeffs[0] == 0);
@@ -384,8 +384,8 @@ static volatile VP8CPUInfo cost_last_cpuinfo_used =
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
   if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_C;
+  VP8SetResidualCoeffs = SetResidualCoeffs_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
diff --git a/thirdparty/libwebp/dsp/cost_mips32.c b/thirdparty/libwebp/src/dsp/cost_mips32.c
index 3102da877a..0500f88c13 100644
--- a/thirdparty/libwebp/dsp/cost_mips32.c
+++ b/thirdparty/libwebp/src/dsp/cost_mips32.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
   int v_reg, ctx_reg;
   int n = res->first;
@@ -96,8 +96,8 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   return cost;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
+static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
+                                     VP8Residual* const res) {
   const int16_t* p_coeffs = (int16_t*)coeffs;
   int temp0, temp1, temp2, n, n1;
   assert(res->first == 0 || coeffs[0] == 0);
@@ -143,8 +143,8 @@ static void SetResidualCoeffs(const int16_t* const coeffs,
 extern void VP8EncDspCostInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
-  VP8GetResidualCost = GetResidualCost;
-  VP8SetResidualCoeffs = SetResidualCoeffs;
+  VP8GetResidualCost = GetResidualCost_MIPS32;
+  VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/cost_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
index 6ec8aeb610..51248de7a1 100644
--- a/thirdparty/libwebp/dsp/cost_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -9,13 +9,13 @@
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "../enc/cost_enc.h"
+#include "src/enc/cost_enc.h"
 
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
   int temp0, temp1;
   int v_reg, ctx_reg;
   int n = res->first;
@@ -97,7 +97,7 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
-  VP8GetResidualCost = GetResidualCost;
+  VP8GetResidualCost = GetResidualCost_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/cost_sse2.c b/thirdparty/libwebp/src/dsp/cost_sse2.c
index 421d51fdd5..487a079921 100644
--- a/thirdparty/libwebp/dsp/cost_sse2.c
+++ b/thirdparty/libwebp/src/dsp/cost_sse2.c
@@ -11,19 +11,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
-static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
-                                  VP8Residual* const res) {
+static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
+                                   VP8Residual* const res) {
   const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
   const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
   // Use SSE2 to compare 16 values with a single instruction.
@@ -42,7 +42,7 @@ static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
   res->coeffs = coeffs;
 }
 
-static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
   uint8_t levels[16], ctxs[16];
   uint16_t abs_levels[16];
   int n = res->first;
@@ -108,8 +108,8 @@ static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
 extern void VP8EncDspCostInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
-  VP8GetResidualCost = GetResidualCostSSE2;
+  VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
+  VP8GetResidualCost = GetResidualCost_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/cpu.c b/thirdparty/libwebp/src/dsp/cpu.c
index b5583b6e9b..8b40feed29 100644
--- a/thirdparty/libwebp/dsp/cpu.c
+++ b/thirdparty/libwebp/src/dsp/cpu.c
@@ -11,7 +11,7 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_HAVE_NEON_RTCD)
 #include <stdio.h>
@@ -143,7 +143,7 @@ static int x86CPUInfo(CPUFeature feature) {
     return !!(cpu_info[2] & (1 << 0));
   }
   if (feature == kSlowSSSE3) {
-    if (is_intel && (cpu_info[2] & (1 << 0))) {   // SSSE3?
+    if (is_intel && (cpu_info[2] & (1 << 9))) {   // SSSE3?
       return CheckSlowModel(cpu_info[0]);
     }
     return 0;
diff --git a/thirdparty/libwebp/dsp/dec.c b/thirdparty/libwebp/src/dsp/dec.c
index 007e985d8b..7e82407567 100644
--- a/thirdparty/libwebp/dsp/dec.c
+++ b/thirdparty/libwebp/src/dsp/dec.c
@@ -11,9 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include <assert.h>
+
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -25,7 +27,7 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 // Transforms (Paragraph 14.4)
 
 #define STORE(x, y, v) \
-  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+  dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
 
 #define STORE2(y, dc, d, c) do {    \
   const int DC = (dc);              \
@@ -38,7 +40,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define MUL1(a) ((((a) * 20091) >> 16) + (a))
 #define MUL2(a) (((a) * 35468) >> 16)
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformOne_C(const int16_t* in, uint8_t* dst) {
   int C[4 * 4], *tmp;
   int i;
   tmp = C;
@@ -78,7 +81,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 }
 
 // Simplified transform when only in[0], in[1] and in[4] are non-zero
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
   const int a = in[0] + 4;
   const int c4 = MUL2(in[4]);
   const int d4 = MUL1(in[4]);
@@ -93,19 +96,21 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
 #undef MUL2
 #undef STORE2
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_C(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_C(in + 16, dst + 4);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformUV(const int16_t* in, uint8_t* dst) {
+static void TransformUV_C(const int16_t* in, uint8_t* dst) {
   VP8Transform(in + 0 * 16, dst, 1);
   VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformDC_C(const int16_t* in, uint8_t* dst) {
   const int DC = in[0] + 4;
   int i, j;
   for (j = 0; j < 4; ++j) {
@@ -114,8 +119,9 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
     }
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
   if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
   if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
   if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
@@ -127,7 +133,8 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) {
 //------------------------------------------------------------------------------
 // Paragraph 14.3
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void TransformWHT_C(const int16_t* in, int16_t* out) {
   int tmp[16];
   int i;
   for (i = 0; i < 4; ++i) {
@@ -153,6 +160,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     out += 64;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
@@ -161,6 +169,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 
 #define DST(x, y) dst[(x) + (y) * BPS]
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   const uint8_t* top = dst - BPS;
   const uint8_t* const clip0 = VP8kclip1 - top[-1];
@@ -174,21 +183,21 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
     dst += BPS;
   }
 }
-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_C(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16_C(uint8_t* dst)  { TrueMotion(dst, 16); }
 
 //------------------------------------------------------------------------------
 // 16x16
 
-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_C(uint8_t* dst) {     // vertical
   int j;
   for (j = 0; j < 16; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 16);
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_C(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     memset(dst, dst[-1], 16);
@@ -203,7 +212,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
   }
 }
 
-static void DC16(uint8_t* dst) {    // DC
+static void DC16_C(uint8_t* dst) {    // DC
   int DC = 16;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -212,7 +221,7 @@ static void DC16(uint8_t* dst) {    // DC
   Put16(DC >> 5, dst);
 }
 
-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_C(uint8_t* dst) {   // DC with top samples not available
   int DC = 8;
   int j;
   for (j = 0; j < 16; ++j) {
@@ -221,7 +230,7 @@ static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_C(uint8_t* dst) {  // DC with left samples not available
   int DC = 8;
   int i;
   for (i = 0; i < 16; ++i) {
@@ -230,9 +239,10 @@ static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
   Put16(DC >> 4, dst);
 }
 
-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+static void DC16NoTopLeft_C(uint8_t* dst) {  // DC with no top and left samples
   Put16(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 
@@ -242,7 +252,8 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
-static void VE4(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE4_C(uint8_t* dst) {    // vertical
   const uint8_t* top = dst - BPS;
   const uint8_t vals[4] = {
     AVG3(top[-1], top[0], top[1]),
@@ -255,8 +266,9 @@ static void VE4(uint8_t* dst) {    // vertical
     memcpy(dst + i * BPS, vals, sizeof(vals));
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HE4(uint8_t* dst) {    // horizontal
+static void HE4_C(uint8_t* dst) {    // horizontal
   const int A = dst[-1 - BPS];
   const int B = dst[-1];
   const int C = dst[-1 + BPS];
@@ -268,7 +280,8 @@ static void HE4(uint8_t* dst) {    // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
 }
 
-static void DC4(uint8_t* dst) {   // DC
+#if !WEBP_NEON_OMIT_C_CODE
+static void DC4_C(uint8_t* dst) {   // DC
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -276,7 +289,7 @@ static void DC4(uint8_t* dst) {   // DC
   for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_C(uint8_t* dst) {   // Down-right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -295,7 +308,7 @@ static void RD4(uint8_t* dst) {   // Down-right
                                       DST(3, 0) = AVG3(D, C, B);
 }
 
-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_C(uint8_t* dst) {   // Down-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -312,8 +325,9 @@ static void LD4(uint8_t* dst) {   // Down-Left
                           DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
                                       DST(3, 3) = AVG3(G, H, H);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_C(uint8_t* dst) {   // Vertical-Right
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -335,7 +349,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
   DST(3, 1) =             AVG3(B, C, D);
 }
 
-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_C(uint8_t* dst) {   // Vertical-Left
   const int A = dst[0 - BPS];
   const int B = dst[1 - BPS];
   const int C = dst[2 - BPS];
@@ -357,7 +371,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
               DST(3, 3) = AVG3(F, G, H);
 }
 
-static void HU4(uint8_t* dst) {   // Horizontal-Up
+static void HU4_C(uint8_t* dst) {   // Horizontal-Up
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -372,7 +386,7 @@ static void HU4(uint8_t* dst) {   // Horizontal-Up
     DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
-static void HD4(uint8_t* dst) {  // Horizontal-Down
+static void HD4_C(uint8_t* dst) {  // Horizontal-Down
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
   const int K = dst[-1 + 2 * BPS];
@@ -404,14 +418,15 @@ VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+#if !WEBP_NEON_OMIT_C_CODE
+static void VE8uv_C(uint8_t* dst) {    // vertical
   int j;
   for (j = 0; j < 8; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 8);
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_C(uint8_t* dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     memset(dst, dst[-1], 8);
@@ -427,7 +442,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
   }
 }
 
-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_C(uint8_t* dst) {     // DC
   int dc0 = 8;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -436,7 +451,7 @@ static void DC8uv(uint8_t* dst) {     // DC
   Put8x8uv(dc0 >> 4, dst);
 }
 
-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_C(uint8_t* dst) {   // DC with no left samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -445,7 +460,7 @@ static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_C(uint8_t* dst) {  // DC with no top samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
@@ -454,17 +469,19 @@ static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
   Put8x8uv(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+static void DC8uvNoTopLeft_C(uint8_t* dst) {    // DC with nothing
   Put8x8uv(0x80, dst);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
 
 //------------------------------------------------------------------------------
 // Edge filtering functions
 
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 // 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
   const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
@@ -474,7 +491,7 @@ static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
 }
 
 // 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   const int a = 3 * (q0 - p0);
   const int a1 = VP8ksclip2[(a + 4) >> 3];
@@ -487,7 +504,7 @@ static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
 }
 
 // 6 pixels in, 6 pixels out
-static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
   const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
   const int q0 = p[0], q1 = p[step], q2 = p[2*step];
   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
@@ -503,18 +520,22 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
   p[ 2*step] = VP8kclip1[q2 - a3];
 }
 
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
   const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
   return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
-                                     int step, int t, int it) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
+                                      int step, int t, int it) {
   const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
   const int p0 = p[-step], q0 = p[0];
   const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
@@ -523,140 +544,159 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
          VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
          VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh2)) {
-      do_filter2(p + i, stride);
+    if (NeedsFilter_C(p + i, stride, thresh2)) {
+      DoFilter2_C(p + i, stride);
     }
   }
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
   int i;
   const int thresh2 = 2 * thresh + 1;
   for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh2)) {
-      do_filter2(p + i * stride, 1);
+    if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
+      DoFilter2_C(p + i * stride, 1);
     }
   }
 }
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_C(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_C(p, stride, thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static WEBP_INLINE void FilterLoop26(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
       } else {
-        do_filter6(p, hstride);
+        DoFilter6_C(p, hstride);
       }
     }
     p += vstride;
   }
 }
 
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
+static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
+                                       int hstride, int vstride, int size,
+                                       int thresh, int ithresh,
+                                       int hev_thresh) {
   const int thresh2 = 2 * thresh + 1;
   while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh2, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
+    if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
+      if (Hev(p, hstride, hev_thresh)) {
+        DoFilter2_C(p, hstride);
       } else {
-        do_filter4(p, hstride);
+        DoFilter4_C(p, hstride);
       }
     }
     p += vstride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+static void VFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+static void HFilter16_C(uint8_t* p, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter16i_C(uint8_t* p, int stride,
+                         int thresh, int ithresh, int hev_thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+    FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
+#if !WEBP_NEON_OMIT_C_CODE
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
-  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE
+static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
+                        int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 
-static void DitherCombine8x8(const uint8_t* dither, uint8_t* dst,
-                             int dst_stride) {
+static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
+                               int dst_stride) {
   int i, j;
   for (j = 0; j < 8; ++j) {
     for (i = 0; i < 8; ++i) {
@@ -709,54 +749,66 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 
   VP8InitClipTables();
 
-  VP8TransformWHT = TransformWHT;
-  VP8Transform = TransformTwo;
-  VP8TransformUV = TransformUV;
-  VP8TransformDC = TransformDC;
-  VP8TransformDCUV = TransformDCUV;
-  VP8TransformAC3 = TransformAC3;
-
-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[3] = HE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-  VP8PredLuma4[8] = HD4;
-  VP8PredLuma4[9] = HU4;
-
-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
-
-  VP8DitherCombine8x8 = DitherCombine8x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8TransformWHT = TransformWHT_C;
+  VP8Transform = TransformTwo_C;
+  VP8TransformDC = TransformDC_C;
+  VP8TransformAC3 = TransformAC3_C;
+#endif
+  VP8TransformUV = TransformUV_C;
+  VP8TransformDCUV = TransformDCUV_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8VFilter16 = VFilter16_C;
+  VP8VFilter16i = VFilter16i_C;
+  VP8HFilter16 = HFilter16_C;
+  VP8VFilter8 = VFilter8_C;
+  VP8VFilter8i = VFilter8i_C;
+  VP8SimpleVFilter16 = SimpleVFilter16_C;
+  VP8SimpleHFilter16 = SimpleHFilter16_C;
+  VP8SimpleVFilter16i = SimpleVFilter16i_C;
+  VP8SimpleHFilter16i = SimpleHFilter16i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8HFilter16i = HFilter16i_C;
+  VP8HFilter8 = HFilter8_C;
+  VP8HFilter8i = HFilter8i_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma4[0] = DC4_C;
+  VP8PredLuma4[1] = TM4_C;
+  VP8PredLuma4[2] = VE4_C;
+  VP8PredLuma4[4] = RD4_C;
+  VP8PredLuma4[6] = LD4_C;
+#endif
+
+  VP8PredLuma4[3] = HE4_C;
+  VP8PredLuma4[5] = VR4_C;
+  VP8PredLuma4[7] = VL4_C;
+  VP8PredLuma4[8] = HD4_C;
+  VP8PredLuma4[9] = HU4_C;
+
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8PredLuma16[0] = DC16_C;
+  VP8PredLuma16[1] = TM16_C;
+  VP8PredLuma16[2] = VE16_C;
+  VP8PredLuma16[3] = HE16_C;
+  VP8PredLuma16[4] = DC16NoTop_C;
+  VP8PredLuma16[5] = DC16NoLeft_C;
+  VP8PredLuma16[6] = DC16NoTopLeft_C;
+
+  VP8PredChroma8[0] = DC8uv_C;
+  VP8PredChroma8[1] = TM8uv_C;
+  VP8PredChroma8[2] = VE8uv_C;
+  VP8PredChroma8[3] = HE8uv_C;
+  VP8PredChroma8[4] = DC8uvNoTop_C;
+  VP8PredChroma8[5] = DC8uvNoLeft_C;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_C;
+#endif
+
+  VP8DitherCombine8x8 = DitherCombine8x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -770,11 +822,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8DspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8DspInitMIPS32();
@@ -791,5 +838,57 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8DspInitNEON();
+  }
+#endif
+
+  assert(VP8TransformWHT != NULL);
+  assert(VP8Transform != NULL);
+  assert(VP8TransformDC != NULL);
+  assert(VP8TransformAC3 != NULL);
+  assert(VP8TransformUV != NULL);
+  assert(VP8TransformDCUV != NULL);
+  assert(VP8VFilter16 != NULL);
+  assert(VP8HFilter16 != NULL);
+  assert(VP8VFilter8 != NULL);
+  assert(VP8HFilter8 != NULL);
+  assert(VP8VFilter16i != NULL);
+  assert(VP8HFilter16i != NULL);
+  assert(VP8VFilter8i != NULL);
+  assert(VP8HFilter8i != NULL);
+  assert(VP8SimpleVFilter16 != NULL);
+  assert(VP8SimpleHFilter16 != NULL);
+  assert(VP8SimpleVFilter16i != NULL);
+  assert(VP8SimpleHFilter16i != NULL);
+  assert(VP8PredLuma4[0] != NULL);
+  assert(VP8PredLuma4[1] != NULL);
+  assert(VP8PredLuma4[2] != NULL);
+  assert(VP8PredLuma4[3] != NULL);
+  assert(VP8PredLuma4[4] != NULL);
+  assert(VP8PredLuma4[5] != NULL);
+  assert(VP8PredLuma4[6] != NULL);
+  assert(VP8PredLuma4[7] != NULL);
+  assert(VP8PredLuma4[8] != NULL);
+  assert(VP8PredLuma4[9] != NULL);
+  assert(VP8PredLuma16[0] != NULL);
+  assert(VP8PredLuma16[1] != NULL);
+  assert(VP8PredLuma16[2] != NULL);
+  assert(VP8PredLuma16[3] != NULL);
+  assert(VP8PredLuma16[4] != NULL);
+  assert(VP8PredLuma16[5] != NULL);
+  assert(VP8PredLuma16[6] != NULL);
+  assert(VP8PredChroma8[0] != NULL);
+  assert(VP8PredChroma8[1] != NULL);
+  assert(VP8PredChroma8[2] != NULL);
+  assert(VP8PredChroma8[3] != NULL);
+  assert(VP8PredChroma8[4] != NULL);
+  assert(VP8PredChroma8[5] != NULL);
+  assert(VP8PredChroma8[6] != NULL);
+  assert(VP8DitherCombine8x8 != NULL);
+
   dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/dec_clip_tables.c b/thirdparty/libwebp/src/dsp/dec_clip_tables.c
index 74ba34c0bb..427b74f776 100644
--- a/thirdparty/libwebp/dsp/dec_clip_tables.c
+++ b/thirdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -11,11 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#define USE_STATIC_TABLES     // undefine to have run-time table initialization
+// define to 0 to have run-time table initialization
+#if !defined(USE_STATIC_TABLES)
+#define USE_STATIC_TABLES 1   // ALTERNATE_CODE
+#endif
 
-#ifdef USE_STATIC_TABLES
+#if (USE_STATIC_TABLES == 1)
 
 static const uint8_t abs0[255 + 255 + 1] = {
   0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
@@ -337,7 +340,7 @@ static uint8_t clip1[255 + 511 + 1];
 // and make sure it's set to true _last_ (so as to be thread-safe)
 static volatile int tables_ok = 0;
 
-#endif
+#endif    // USE_STATIC_TABLES
 
 const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
 const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
@@ -345,7 +348,7 @@ const uint8_t* const VP8kclip1 = &clip1[255];
 const uint8_t* const VP8kabs0 = &abs0[255];
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
-#if !defined(USE_STATIC_TABLES)
+#if (USE_STATIC_TABLES == 0)
   int i;
   if (!tables_ok) {
     for (i = -255; i <= 255; ++i) {
diff --git a/thirdparty/libwebp/dsp/dec_mips32.c b/thirdparty/libwebp/src/dsp/dec_mips32.c
index 4e9ef42605..e4e70966d2 100644
--- a/thirdparty/libwebp/dsp/dec_mips32.c
+++ b/thirdparty/libwebp/src/dsp/dec_mips32.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/thirdparty/libwebp/dsp/dec_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
index db5c657228..b0936bc46e 100644
--- a/thirdparty/libwebp/dsp/dec_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./mips_macro.h"
+#include "src/dsp/mips_macro.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
diff --git a/thirdparty/libwebp/dsp/dec_msa.c b/thirdparty/libwebp/src/dsp/dec_msa.c
index 8d9c98c3cf..8090622b7b 100644
--- a/thirdparty/libwebp/dsp/dec_msa.c
+++ b/thirdparty/libwebp/src/dsp/dec_msa.c
@@ -12,11 +12,11 @@
 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
 
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
 
 //------------------------------------------------------------------------------
 // Transforms
@@ -222,6 +222,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const v16i8 cnst4b = __msa_ldi_b(4);                        \
   const v16i8 cnst3b = __msa_ldi_b(3);                        \
   const v8i16 cnst9h = __msa_ldi_h(9);                        \
+  const v8i16 cnst63h = __msa_ldi_h(63);                      \
                                                               \
   FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m);         \
   filt = __msa_subs_s_b(p1_m, q1_m);                          \
@@ -241,9 +242,9 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l);               \
   /* update q2/p2 */                                          \
   temp0 = filt_r * cnst9h;                                    \
-  temp1 = ADDVI_H(temp0, 63);                                 \
+  temp1 = temp0 + cnst63h;                                    \
   temp2 = filt_l * cnst9h;                                    \
-  temp3 = ADDVI_H(temp2, 63);                                 \
+  temp3 = temp2 + cnst63h;                                    \
   FILT2(q2_m, p2_m, q2, p2);                                  \
   /* update q1/p1 */                                          \
   temp1 = temp1 + temp0;                                      \
@@ -708,7 +709,7 @@ static void VE4(uint8_t* dst) {    // vertical
   const uint32_t val0 = LW(ptop + 0);
   const uint32_t val1 = LW(ptop + 4);
   uint32_t out;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
 
   INSERT_W2_UB(val0, val1, A);
   B = SLDI_UB(A, A, 1);
@@ -725,7 +726,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   uint32_t val0 = LW(ptop + 0);
   uint32_t val1 = LW(ptop + 4);
   uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R, A1;
+  v16u8 A, B, C, AC, B2, R, A1 = { 0 };
 
   INSERT_W2_UB(val0, val1, A1);
   A = SLDI_UB(A1, A1, 12);
@@ -753,7 +754,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
   uint32_t val0 = LW(ptop + 0);
   uint32_t val1 = LW(ptop + 4);
   uint32_t val2, val3;
-  v16u8 A, B, C, AC, B2, R;
+  v16u8 A = { 0 }, B, C, AC, B2, R;
 
   INSERT_W2_UB(val0, val1, A);
   B = SLDI_UB(A, A, 1);
diff --git a/thirdparty/libwebp/dsp/dec_neon.c b/thirdparty/libwebp/src/dsp/dec_neon.c
index 34796cf4a2..ffa697fcf9 100644
--- a/thirdparty/libwebp/dsp/dec_neon.c
+++ b/thirdparty/libwebp/src/dsp/dec_neon.c
@@ -12,43 +12,23 @@
 // Authors: Somnath Banerjee (somnath@google.com)
 //          Johann Koenig (johannkoenig@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
-#include "./neon.h"
-#include "../dec/vp8i_dec.h"
+#include "src/dsp/neon.h"
+#include "src/dec/vp8i_dec.h"
 
 //------------------------------------------------------------------------------
 // NxM Loading functions
 
-// Load/Store vertical edge
-#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
-  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
-  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
-  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
-
-#define STORE8x2(c1, c2, p, stride)                                            \
-  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
-  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
-
 #if !defined(WORK_AROUND_GCC)
 
 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
 // (register alloc, probably). The variants somewhat mitigate the problem, but
 // not quite. HFilter16i() remains problematic.
-static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
+static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
+                                            int stride) {
   const uint8x8_t zero = vdup_n_u8(0);
   uint8x8x4_t out;
   INIT_VECTOR4(out, zero, zero, zero, zero);
@@ -63,13 +43,15 @@ static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
   return out;
 }
 
-static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
-  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
-  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
+  const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
+  const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
@@ -83,9 +65,11 @@ static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
   src += stride;                                                     \
 } while (0)
 
-static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   const uint32x4_t zero = vdupq_n_u32(0);
   uint32x4x4_t in;
   INIT_VECTOR4(in, zero, zero, zero, zero);
@@ -126,40 +110,40 @@ static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load4x16(src - 2, stride, p3, p2, p1, p0);
-  Load4x16(src + 2, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load8x16_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
+  Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1) {
+static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
+                                      uint8x16_t* const p1,
+                                      uint8x16_t* const p0,
+                                      uint8x16_t* const q0,
+                                      uint8x16_t* const q1) {
   *p1 = vld1q_u8(src - 2 * stride);
   *p0 = vld1q_u8(src - 1 * stride);
   *q0 = vld1q_u8(src + 0 * stride);
   *q1 = vld1q_u8(src + 1 * stride);
 }
 
-static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
-                                 uint8x16_t* const p3, uint8x16_t* const p2,
-                                 uint8x16_t* const p1, uint8x16_t* const p0,
-                                 uint8x16_t* const q0, uint8x16_t* const q1,
-                                 uint8x16_t* const q2, uint8x16_t* const q3) {
-  Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
-  Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
+static WEBP_INLINE void Load16x8_NEON(
+    const uint8_t* const src, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
+  Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
+  Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
 }
 
-static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
-                                  const uint8_t* const v,
-                                  int stride,
-                                  uint8x16_t* const p3, uint8x16_t* const p2,
-                                  uint8x16_t* const p1, uint8x16_t* const p0,
-                                  uint8x16_t* const q0, uint8x16_t* const q1,
-                                  uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
@@ -177,13 +161,11 @@ static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
 #define LOAD_UV_8(ROW) \
   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
 
-static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
-                                   const uint8_t* const v,
-                                   int stride,
-                                   uint8x16_t* const p3, uint8x16_t* const p2,
-                                   uint8x16_t* const p1, uint8x16_t* const p0,
-                                   uint8x16_t* const q0, uint8x16_t* const q1,
-                                   uint8x16_t* const q2, uint8x16_t* const q3) {
+static WEBP_INLINE void Load8x8x2T_NEON(
+    const uint8_t* const u, const uint8_t* const v, int stride,
+    uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
+    uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
+    uint8x16_t* const q2, uint8x16_t* const q3) {
   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
   // and the v-samples on the higher half.
   const uint8x16_t row0 = LOAD_UV_8(0);
@@ -238,8 +220,8 @@ static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
 
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
+                                      uint8_t* const dst, int stride) {
   vst2_lane_u8(dst + 0 * stride, v, 0);
   vst2_lane_u8(dst + 1 * stride, v, 1);
   vst2_lane_u8(dst + 2 * stride, v, 2);
@@ -250,20 +232,20 @@ static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
   vst2_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   uint8x8x2_t lo, hi;
   lo.val[0] = vget_low_u8(p0);
   lo.val[1] = vget_low_u8(q0);
   hi.val[0] = vget_high_u8(p0);
   hi.val[1] = vget_high_u8(q0);
-  Store2x8(lo, dst - 1 + 0 * stride, stride);
-  Store2x8(hi, dst - 1 + 8 * stride, stride);
+  Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
+  Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
 }
 
 #if !defined(WORK_AROUND_GCC)
-static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
-                                 uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
+                                      uint8_t* const dst, int stride) {
   vst4_lane_u8(dst + 0 * stride, v, 0);
   vst4_lane_u8(dst + 1 * stride, v, 1);
   vst4_lane_u8(dst + 2 * stride, v, 2);
@@ -274,9 +256,9 @@ static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
   vst4_lane_u8(dst + 7 * stride, v, 7);
 }
 
-static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
   uint8x8x4_t lo, hi;
   INIT_VECTOR4(lo,
                vget_low_u8(p1), vget_low_u8(p0),
@@ -284,27 +266,28 @@ static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
   INIT_VECTOR4(hi,
                vget_high_u8(p1), vget_high_u8(p0),
                vget_high_u8(q0), vget_high_u8(q1));
-  Store4x8(lo, dst - 2 + 0 * stride, stride);
-  Store4x8(hi, dst - 2 + 8 * stride, stride);
+  Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
+  Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
 }
 #endif  // !WORK_AROUND_GCC
 
-static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
-                                  uint8_t* const dst, int stride) {
+static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
+                                       uint8_t* const dst, int stride) {
   vst1q_u8(dst - stride, p0);
   vst1q_u8(dst, q0);
 }
 
-static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
-                                  const uint8x16_t q0, const uint8x16_t q1,
-                                  uint8_t* const dst, int stride) {
-  Store16x2(p1, p0, dst - stride, stride);
-  Store16x2(q0, q1, dst + stride, stride);
+static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                       const uint8x16_t q0, const uint8x16_t q1,
+                                       uint8_t* const dst, int stride) {
+  Store16x2_NEON(p1, p0, dst - stride, stride);
+  Store16x2_NEON(q0, q1, dst + stride, stride);
 }
 
-static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   // p0 and q0 contain the u+v samples packed in low/high halves.
   vst1_u8(u - stride, vget_low_u8(p0));
   vst1_u8(u,          vget_low_u8(q0));
@@ -312,13 +295,15 @@ static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
   vst1_u8(v,          vget_high_u8(q0));
 }
 
-static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
-                                   const uint8x16_t q0, const uint8x16_t q1,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
+                                        const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        const uint8x16_t q1,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   // The p1...q1 registers contain the u+v samples packed in low/high halves.
-  Store8x2x2(p1, p0, u - stride, v - stride, stride);
-  Store8x2x2(q0, q1, u + stride, v + stride, stride);
+  Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
+  Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
 }
 
 #if !defined(WORK_AROUND_GCC)
@@ -329,11 +314,10 @@ static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
   (DST) += stride;                                \
 } while (0)
 
-static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
-                                   const uint8x16_t p0, const uint8x16_t q0,
-                                   const uint8x16_t q1, const uint8x16_t q2,
-                                   uint8_t* u, uint8_t* v,
-                                   int stride) {
+static WEBP_INLINE void Store6x8x2_NEON(
+    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
+    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
+    uint8_t* u, uint8_t* v, int stride) {
   uint8x8x3_t u0, u1, v0, v1;
   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
@@ -358,10 +342,12 @@ static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
 }
 #undef STORE6_LANE
 
-static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
-                                   const uint8x16_t q0, const uint8x16_t q1,
-                                   uint8_t* const u, uint8_t* const v,
-                                   int stride) {
+static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
+                                        const uint8x16_t p0,
+                                        const uint8x16_t q0,
+                                        const uint8x16_t q1,
+                                        uint8_t* const u, uint8_t* const v,
+                                        int stride) {
   uint8x8x4_t u0, v0;
   INIT_VECTOR4(u0,
                vget_low_u8(p1), vget_low_u8(p0),
@@ -390,15 +376,15 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
 #endif  // !WORK_AROUND_GCC
 
 // Zero extend 'v' to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
   return vreinterpretq_s16_u16(vmovl_u8(v));
 }
 
 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
-                                            const int16x8_t dst01,
-                                            const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
   // Unsigned saturate to 8b.
   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -410,8 +396,9 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }
 
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
-                               uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    uint8_t* const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
 
@@ -423,23 +410,23 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
 
   {
     // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
-    const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
 
     // Descale with rounding.
     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     // Add the inverse transform.
-    SaturateAndStore4x4(dst, out01, out23);
+    SaturateAndStore4x4_NEON(dst, out01, out23);
   }
 }
 
 //-----------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
-                              const uint8x16_t q0, const uint8x16_t q1,
-                              int thresh) {
+static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                   const uint8x16_t q0, const uint8x16_t q1,
+                                   int thresh) {
   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
@@ -450,18 +437,18 @@ static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
   return mask;
 }
 
-static int8x16_t FlipSign(const uint8x16_t v) {
+static int8x16_t FlipSign_NEON(const uint8x16_t v) {
   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
 }
 
-static uint8x16_t FlipSignBack(const int8x16_t v) {
+static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
   const int8x16_t sign_bit = vdupq_n_s8(0x80);
   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
 }
 
-static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
-                              const int8x16_t q0, const int8x16_t q1) {
+static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
+                                   const int8x16_t q0, const int8x16_t q1) {
   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
@@ -470,7 +457,7 @@ static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
   return s3;
 }
 
-static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
+static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
@@ -479,9 +466,10 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
 
 //------------------------------------------------------------------------------
 
-static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
-                               const int8x16_t delta,
-                               int8x16_t* const op0, int8x16_t* const oq0) {
+static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
+                                    const int8x16_t delta,
+                                    int8x16_t* const op0,
+                                    int8x16_t* const oq0) {
   const int8x16_t kCst3 = vdupq_n_s8(0x03);
   const int8x16_t kCst4 = vdupq_n_s8(0x04);
   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -494,9 +482,9 @@ static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
 
 #if defined(WEBP_USE_INTRINSICS)
 
-static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
-                         const int8x16_t delta,
-                         uint8x16_t* const op0, uint8x16_t* const oq0) {
+static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
+                              const int8x16_t delta,
+                              uint8x16_t* const op0, uint8x16_t* const oq0) {
   const int8x16_t kCst3 = vdupq_n_s8(0x03);
   const int8x16_t kCst4 = vdupq_n_s8(0x04);
   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
@@ -505,45 +493,66 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
-  *op0 = FlipSignBack(sp0);
-  *oq0 = FlipSignBack(sq0);
-}
-
-static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
-                      const uint8x16_t q0, const uint8x16_t q1,
-                      const uint8x16_t mask,
-                      uint8x16_t* const op0, uint8x16_t* const oq0) {
-  const int8x16_t p1s = FlipSign(p1);
-  const int8x16_t p0s = FlipSign(p0);
-  const int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
-  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+  *op0 = FlipSignBack_NEON(sp0);
+  *oq0 = FlipSignBack_NEON(sq0);
+}
+
+static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                           const uint8x16_t q0, const uint8x16_t q1,
+                           const uint8x16_t mask,
+                           uint8x16_t* const op0, uint8x16_t* const oq0) {
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  const int8x16_t p0s = FlipSign_NEON(p0);
+  const int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
+  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
-  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
+  ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
 }
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
   uint8x16_t p1, p0, q0, q1, op0, oq0;
-  Load16x4(p, stride, &p1, &p0, &q0, &q1);
+  Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
   {
-    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
-    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
   }
-  Store16x2(op0, oq0, p, stride);
+  Store16x2_NEON(op0, oq0, p, stride);
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
   uint8x16_t p1, p0, q0, q1, oq0, op0;
-  Load4x16(p, stride, &p1, &p0, &q0, &q1);
+  Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
   {
-    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
-    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
+    const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
+    DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
   }
-  Store2x16(op0, oq0, p, stride);
+  Store2x16_NEON(op0, oq0, p, stride);
 }
 
 #else
 
+// Load/Store vertical edge
+#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
+  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
+  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
+  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
+
+#define STORE8x2(c1, c2, p, stride)                                            \
+  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
+  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
+
 #define QRegs "q0", "q1", "q2", "q3",                                          \
               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
 
@@ -592,7 +601,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
   FLIP_SIGN_BIT2(p0, q0, q10)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
   __asm__ volatile (
     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 
@@ -613,7 +622,7 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
   );
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
   __asm__ volatile (
     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
@@ -639,30 +648,33 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
   );
 }
 
+#undef LOAD8x4
+#undef STORE8x2
+
 #endif    // WEBP_USE_INTRINSICS
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
   uint32_t k;
   for (k = 3; k != 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_NEON(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
   uint32_t k;
   for (k = 3; k != 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_NEON(p, stride, thresh);
   }
 }
 
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 
-static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
-                           const uint8x16_t q0, const uint8x16_t q1,
-                           int hev_thresh) {
+static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
+                                const uint8x16_t q0, const uint8x16_t q1,
+                                int hev_thresh) {
   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
@@ -671,11 +683,11 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
   return mask;
 }
 
-static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
-                               const uint8x16_t p1, const uint8x16_t p0,
-                               const uint8x16_t q0, const uint8x16_t q1,
-                               const uint8x16_t q2, const uint8x16_t q3,
-                               int ithresh, int thresh) {
+static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
+                                    const uint8x16_t p1, const uint8x16_t p0,
+                                    const uint8x16_t q0, const uint8x16_t q1,
+                                    const uint8x16_t q2, const uint8x16_t q3,
+                                    int ithresh, int thresh) {
   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
@@ -689,14 +701,14 @@ static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
   const uint8x16_t max12 = vmaxq_u8(max1, max2);
   const uint8x16_t max123 = vmaxq_u8(max12, max3);
   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
-  const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
+  const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
   const uint8x16_t mask = vandq_u8(mask1, mask2);
   return mask;
 }
 
 //  4-points filter
 
-static void ApplyFilter4(
+static void ApplyFilter4_NEON(
     const int8x16_t p1, const int8x16_t p0,
     const int8x16_t q0, const int8x16_t q1,
     const int8x16_t delta0,
@@ -709,47 +721,47 @@ static void ApplyFilter4(
   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
-  *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
-  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
-  *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
-  *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
+  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
+  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
+  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
+  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
 }
 
-static void DoFilter4(
+static void DoFilter4_NEON(
     const uint8x16_t p1, const uint8x16_t p0,
     const uint8x16_t q0, const uint8x16_t q1,
     const uint8x16_t mask, const uint8x16_t hev_mask,
     uint8x16_t* const op1, uint8x16_t* const op0,
     uint8x16_t* const oq0, uint8x16_t* const oq1) {
   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
-  const int8x16_t p1s = FlipSign(p1);
-  int8x16_t p0s = FlipSign(p0);
-  int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  int8x16_t p0s = FlipSign_NEON(p0);
+  int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 
   // do_filter2 part (simple loopfilter on pixels with hev)
   {
-    const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
+    const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
     const int8x16_t simple_lf_delta =
         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
-    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
   }
 
   // do_filter4 part (complex loopfilter on pixels without hev)
   {
-    const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
+    const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
     const int8x16_t complex_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
-    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
+    ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
   }
 }
 
 //  6-points filter
 
-static void ApplyFilter6(
+static void ApplyFilter6_NEON(
     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
     const int8x16_t delta,
@@ -778,35 +790,35 @@ static void ApplyFilter6(
   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
 
-  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
-  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - q1)
-  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
-  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
-  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
-  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
+  *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
+  *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
+  *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
+  *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
+  *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
+  *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
 }
 
-static void DoFilter6(
+static void DoFilter6_NEON(
     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
     const uint8x16_t mask, const uint8x16_t hev_mask,
     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
-  const int8x16_t p2s = FlipSign(p2);
-  const int8x16_t p1s = FlipSign(p1);
-  int8x16_t p0s = FlipSign(p0);
-  int8x16_t q0s = FlipSign(q0);
-  const int8x16_t q1s = FlipSign(q1);
-  const int8x16_t q2s = FlipSign(q2);
+  const int8x16_t p2s = FlipSign_NEON(p2);
+  const int8x16_t p1s = FlipSign_NEON(p1);
+  int8x16_t p0s = FlipSign_NEON(p0);
+  int8x16_t q0s = FlipSign_NEON(q0);
+  const int8x16_t q1s = FlipSign_NEON(q1);
+  const int8x16_t q2s = FlipSign_NEON(q2);
   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
-  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
+  const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
 
   // do_filter2 part (simple loopfilter on pixels with hev)
   {
     const int8x16_t simple_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
-    ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
+    ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
   }
 
   // do_filter6 part (complex loopfilter on pixels without hev)
@@ -815,65 +827,65 @@ static void DoFilter6(
     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
     const int8x16_t complex_lf_delta =
         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
-    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
-                 op2, op1, op0, oq0, oq1, oq2);
+    ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
+                      op2, op1, op0, oq0, oq1, oq2);
   }
 }
 
 // on macroblock edges
 
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_NEON(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store16x2(op2, op1, p - 2 * stride, stride);
-    Store16x2(op0, oq0, p + 0 * stride, stride);
-    Store16x2(oq1, oq2, p + 2 * stride, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store16x2_NEON(op2, op1, p - 2 * stride, stride);
+    Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
+    Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
   }
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_NEON(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store2x16(op2, op1, p - 2, stride);
-    Store2x16(op0, oq0, p + 0, stride);
-    Store2x16(oq1, oq2, p + 2, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store2x16_NEON(op2, op1, p - 2, stride);
+    Store2x16_NEON(op0, oq0, p + 0, stride);
+    Store2x16_NEON(oq1, oq2, p + 2, stride);
   }
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_NEON(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   uint32_t k;
   uint8x16_t p3, p2, p1, p0;
-  Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
+  Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
   for (k = 3; k != 0; --k) {
     uint8x16_t q0, q1, q2, q3;
     p += 4 * stride;
-    Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
+    Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
-          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
-      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
       // p3 and p2 are not just temporary variables here: they will be
       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
-      Store16x4(p1, p0, p3, p2, p, stride);
+      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store16x4_NEON(p1, p0, p3, p2, p, stride);
       p1 = q2;
       p0 = q3;
     }
@@ -881,21 +893,21 @@ static void VFilter16i(uint8_t* p, int stride,
 }
 
 #if !defined(WORK_AROUND_GCC)
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_NEON(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   uint32_t k;
   uint8x16_t p3, p2, p1, p0;
-  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
+  Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
   for (k = 3; k != 0; --k) {
     uint8x16_t q0, q1, q2, q3;
     p += 4;
-    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
+    Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
     {
       const uint8x16_t mask =
-          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
-      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
-      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
-      Store4x16(p1, p0, p3, p2, p, stride);
+          NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
+      const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
+      DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
+      Store4x16_NEON(p1, p0, p3, p2, p, stride);
       p1 = q2;
       p0 = q3;
     }
@@ -904,67 +916,67 @@ static void HFilter16i(uint8_t* p, int stride,
 #endif  // !WORK_AROUND_GCC
 
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
-    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
-    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
+    Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
+    Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
   }
 }
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4 * stride;
   v += 4 * stride;
-  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op1, op0, oq0, oq1;
-    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
+    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
   }
 }
 
 #if !defined(WORK_AROUND_GCC)
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
-  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
-    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
-              &op2, &op1, &op0, &oq0, &oq1, &oq2);
-    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
+    DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
+                   &op2, &op1, &op0, &oq0, &oq1, &oq2);
+    Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
   }
 }
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
   u += 4;
   v += 4;
-  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
+  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
   {
-    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
-                                         ithresh, thresh);
-    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
+    const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
+                                              ithresh, thresh);
+    const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
     uint8x16_t op1, op0, oq0, oq1;
-    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
-    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
+    DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
+    Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
   }
 }
 #endif  // !WORK_AROUND_GCC
@@ -992,8 +1004,9 @@ static const int16_t kC1 = 20091;
 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 
 #if defined(WEBP_USE_INTRINSICS)
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
-                                     int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@@ -1001,7 +1014,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }
 
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   // {rows} = in0 | in4
   //          in8 | in12
   // B1 = in4 | in12
@@ -1024,20 +1037,20 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2(E0, E1, rows);
+  Transpose8x2_NEON(E0, E1, rows);
 }
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass(&rows);
-  TransformPass(&rows);
-  Add4x4(rows.val[0], rows.val[1], dst);
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], dst);
 }
 
 #else
 
-static void TransformOne(const int16_t* in, uint8_t* dst) {
+static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   const int kBPS = BPS;
   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   const int16_t constants[4] = { kC1, kC2, 0, 0 };
@@ -1170,16 +1183,16 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
 
 #endif    // WEBP_USE_INTRINSICS
 
-static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOne(in, dst);
+static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne_NEON(in, dst);
   if (do_two) {
-    TransformOne(in + 16, dst + 4);
+    TransformOne_NEON(in + 16, dst + 4);
   }
 }
 
-static void TransformDC(const int16_t* in, uint8_t* dst) {
+static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
   const int16x8_t DC = vdupq_n_s16(in[0]);
-  Add4x4(DC, DC, dst);
+  Add4x4_NEON(DC, DC, dst);
 }
 
 //------------------------------------------------------------------------------
@@ -1191,7 +1204,7 @@ static void TransformDC(const int16_t* in, uint8_t* dst) {
   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
 } while (0)
 
-static void TransformWHT(const int16_t* in, int16_t* out) {
+static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
   int32x4x4_t tmp;
 
   {
@@ -1209,7 +1222,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     tmp.val[2] = vsubq_s32(a0, a1);
     tmp.val[3] = vsubq_s32(a3, a2);
     // Arrange the temporary results column-wise.
-    tmp = Transpose4x4(tmp);
+    tmp = Transpose4x4_NEON(tmp);
   }
 
   {
@@ -1243,7 +1256,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
 //------------------------------------------------------------------------------
 
 #define MUL(a, b) (((a) * (b)) >> 16)
-static void TransformAC3(const int16_t* in, uint8_t* dst) {
+static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
   static const int kC1_full = 20091 + (1 << 16);
   static const int kC2_full = 35468;
   const int16x4_t A = vld1_dup_s16(in);
@@ -1259,14 +1272,14 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
   const int16x4_t B = vqadd_s16(A, CD);
   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
-  Add4x4(m0_m1, m2_m3, dst);
+  Add4x4_NEON(m0_m1, m2_m3, dst);
 }
 #undef MUL
 
 //------------------------------------------------------------------------------
 // 4x4
 
-static void DC4(uint8_t* dst) {    // DC
+static void DC4_NEON(uint8_t* dst) {    // DC
   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   const uint16x4_t p1 = vpadd_u16(p0, p0);
@@ -1287,17 +1300,17 @@ static void DC4(uint8_t* dst) {    // DC
 }
 
 // TrueMotion (4x4 + 8x8)
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
   int y;
   for (y = 0; y < size; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
     const int16x8_t r1 = vaddq_s16(L1, d);
     const int16x8_t r2 = vaddq_s16(L2, d);
@@ -1322,9 +1335,9 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   }
 }
 
-static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
 
-static void VE4(uint8_t* dst) {    // vertical
+static void VE4_NEON(uint8_t* dst) {    // vertical
   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
   const uint64x1_t A1 = vshr_n_u64(A0, 8);
@@ -1340,7 +1353,7 @@ static void VE4(uint8_t* dst) {    // vertical
   }
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_NEON(uint8_t* dst) {   // Down-right
   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
@@ -1368,7 +1381,7 @@ static void RD4(uint8_t* dst) {   // Down-right
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
 }
 
-static void LD4(uint8_t* dst) {    // Down-left
+static void LD4_NEON(uint8_t* dst) {    // Down-left
   // Note using the same shift trick as VE4() is slower here.
   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
@@ -1390,7 +1403,7 @@ static void LD4(uint8_t* dst) {    // Down-left
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+static void VE8uv_NEON(uint8_t* dst) {    // vertical
   const uint8x8_t top = vld1_u8(dst - BPS);
   int j;
   for (j = 0; j < 8; ++j) {
@@ -1398,7 +1411,7 @@ static void VE8uv(uint8_t* dst) {    // vertical
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
+static void HE8uv_NEON(uint8_t* dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     const uint8x8_t left = vld1_dup_u8(dst - 1);
@@ -1407,7 +1420,7 @@ static void HE8uv(uint8_t* dst) {    // horizontal
   }
 }
 
-static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
   uint16x8_t sum_top;
   uint16x8_t sum_left;
   uint8x8_t dc0;
@@ -1458,17 +1471,17 @@ static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
   }
 }
 
-static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
-static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
-static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
-static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
+static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
+static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
+static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
 
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
 
 //------------------------------------------------------------------------------
 // 16x16
 
-static void VE16(uint8_t* dst) {     // vertical
+static void VE16_NEON(uint8_t* dst) {     // vertical
   const uint8x16_t top = vld1q_u8(dst - BPS);
   int j;
   for (j = 0; j < 16; ++j) {
@@ -1476,7 +1489,7 @@ static void VE16(uint8_t* dst) {     // vertical
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_NEON(uint8_t* dst) {     // horizontal
   int j;
   for (j = 0; j < 16; ++j) {
     const uint8x16_t left = vld1q_dup_u8(dst - 1);
@@ -1485,7 +1498,7 @@ static void HE16(uint8_t* dst) {     // horizontal
   }
 }
 
-static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
   uint16x8_t sum_top;
   uint16x8_t sum_left;
   uint8x8_t dc0;
@@ -1542,12 +1555,12 @@ static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
   }
 }
 
-static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
-static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
-static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
-static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
+static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
+static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
+static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
 
-static void TM16(uint8_t* dst) {
+static void TM16_NEON(uint8_t* dst) {
   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   // A[c] - A[-1]
@@ -1556,10 +1569,10 @@ static void TM16(uint8_t* dst) {
   int y;
   for (y = 0; y < 16; y += 4) {
     // left edge
-    const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
-    const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
-    const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
-    const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+    const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
+    const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
+    const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
+    const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
@@ -1587,49 +1600,49 @@ static void TM16(uint8_t* dst) {
 extern void VP8DspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
-  VP8Transform = TransformTwo;
-  VP8TransformAC3 = TransformAC3;
-  VP8TransformDC = TransformDC;
-  VP8TransformWHT = TransformWHT;
-
-  VP8VFilter16 = VFilter16;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16 = HFilter16;
+  VP8Transform = TransformTwo_NEON;
+  VP8TransformAC3 = TransformAC3_NEON;
+  VP8TransformDC = TransformDC_NEON;
+  VP8TransformWHT = TransformWHT_NEON;
+
+  VP8VFilter16 = VFilter16_NEON;
+  VP8VFilter16i = VFilter16i_NEON;
+  VP8HFilter16 = HFilter16_NEON;
 #if !defined(WORK_AROUND_GCC)
-  VP8HFilter16i = HFilter16i;
+  VP8HFilter16i = HFilter16i_NEON;
 #endif
-  VP8VFilter8 = VFilter8;
-  VP8VFilter8i = VFilter8i;
+  VP8VFilter8 = VFilter8_NEON;
+  VP8VFilter8i = VFilter8i_NEON;
 #if !defined(WORK_AROUND_GCC)
-  VP8HFilter8 = HFilter8;
-  VP8HFilter8i = HFilter8i;
+  VP8HFilter8 = HFilter8_NEON;
+  VP8HFilter8i = HFilter8i_NEON;
 #endif
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[0] = DC4;
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[6] = LD4;
-
-  VP8PredLuma16[0] = DC16TopLeft;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
+  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
+  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
+  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
+
+  VP8PredLuma4[0] = DC4_NEON;
+  VP8PredLuma4[1] = TM4_NEON;
+  VP8PredLuma4[2] = VE4_NEON;
+  VP8PredLuma4[4] = RD4_NEON;
+  VP8PredLuma4[6] = LD4_NEON;
+
+  VP8PredLuma16[0] = DC16TopLeft_NEON;
+  VP8PredLuma16[1] = TM16_NEON;
+  VP8PredLuma16[2] = VE16_NEON;
+  VP8PredLuma16[3] = HE16_NEON;
+  VP8PredLuma16[4] = DC16NoTop_NEON;
+  VP8PredLuma16[5] = DC16NoLeft_NEON;
+  VP8PredLuma16[6] = DC16NoTopLeft_NEON;
+
+  VP8PredChroma8[0] = DC8uv_NEON;
+  VP8PredChroma8[1] = TM8uv_NEON;
+  VP8PredChroma8[2] = VE8uv_NEON;
+  VP8PredChroma8[3] = HE8uv_NEON;
+  VP8PredChroma8[4] = DC8uvNoTop_NEON;
+  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/dec_sse2.c b/thirdparty/libwebp/src/dsp/dec_sse2.c
index 411fb02768..b3840faf3a 100644
--- a/thirdparty/libwebp/dsp/dec_sse2.c
+++ b/thirdparty/libwebp/src/dsp/dec_sse2.c
@@ -12,23 +12,25 @@
 // Author: somnath@google.com (Somnath Banerjee)
 //         cduvivier@google.com (Christian Duvivier)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
 // The 3-coeff sparse transform in SSE2 is not really faster than the plain-C
 // one it seems => disable it by default. Uncomment the following to enable:
-// #define USE_TRANSFORM_AC3
+#if !defined(USE_TRANSFORM_AC3)
+#define USE_TRANSFORM_AC3 0   // ALTERNATE_CODE
+#endif
 
 #include <emmintrin.h>
-#include "./common_sse2.h"
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
-static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
+static void Transform_SSE2(const int16_t* in, uint8_t* dst, int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,7 +195,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
   }
 }
 
-#if defined(USE_TRANSFORM_AC3)
+#if (USE_TRANSFORM_AC3 == 1)
 #define MUL(a, b) (((a) * (b)) >> 16)
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   static const int kC1 = 20091 + (1 << 16);
@@ -248,7 +250,7 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
     _mm_subs_epu8((p), (q)))
 
 // Shift each byte of "x" by 3 bits while preserving by the sign bit.
-static WEBP_INLINE void SignedShift8b(__m128i* const x) {
+static WEBP_INLINE void SignedShift8b_SSE2(__m128i* const x) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
   const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
@@ -258,8 +260,8 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }
 
 #define FLIP_SIGN_BIT2(a, b) {                                                 \
-  a = _mm_xor_si128(a, sign_bit);                                              \
-  b = _mm_xor_si128(b, sign_bit);                                              \
+  (a) = _mm_xor_si128(a, sign_bit);                                            \
+  (b) = _mm_xor_si128(b, sign_bit);                                            \
 }
 
 #define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
@@ -268,11 +270,11 @@ static WEBP_INLINE void SignedShift8b(__m128i* const x) {
 }
 
 // input/output is uint8_t
-static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  int hev_thresh, __m128i* const not_hev) {
+static WEBP_INLINE void GetNotHEV_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       int hev_thresh, __m128i* const not_hev) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i t_1 = MM_ABS(*p1, *p0);
   const __m128i t_2 = MM_ABS(*q1, *q0);
@@ -285,11 +287,11 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
 }
 
 // input pixels are int8_t
-static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
-                                     const __m128i* const p0,
-                                     const __m128i* const q0,
-                                     const __m128i* const q1,
-                                     __m128i* const delta) {
+static WEBP_INLINE void GetBaseDelta_SSE2(const __m128i* const p1,
+                                          const __m128i* const p0,
+                                          const __m128i* const q0,
+                                          const __m128i* const q1,
+                                          __m128i* const delta) {
   // beware of addition order, for saturation!
   const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
   const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
@@ -300,15 +302,16 @@ static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
 }
 
 // input and output are int8_t
-static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
-                                       const __m128i* const fl) {
+static WEBP_INLINE void DoSimpleFilter_SSE2(__m128i* const p0,
+                                            __m128i* const q0,
+                                            const __m128i* const fl) {
   const __m128i k3 = _mm_set1_epi8(3);
   const __m128i k4 = _mm_set1_epi8(4);
   __m128i v3 = _mm_adds_epi8(*fl, k3);
   __m128i v4 = _mm_adds_epi8(*fl, k4);
 
-  SignedShift8b(&v4);                  // v4 >> 3
-  SignedShift8b(&v3);                  // v3 >> 3
+  SignedShift8b_SSE2(&v4);             // v4 >> 3
+  SignedShift8b_SSE2(&v3);             // v3 >> 3
   *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
   *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
 }
@@ -317,9 +320,9 @@ static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
 // Update operations:
 // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
 // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
-static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
-                                      const __m128i* const a0_lo,
-                                      const __m128i* const a0_hi) {
+static WEBP_INLINE void Update2Pixels_SSE2(__m128i* const pi, __m128i* const qi,
+                                           const __m128i* const a0_lo,
+                                           const __m128i* const a0_hi) {
   const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
   const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
   const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
@@ -330,11 +333,11 @@ static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
 }
 
 // input pixels are uint8_t
-static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, __m128i* const mask) {
+static WEBP_INLINE void NeedsFilter_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, __m128i* const mask) {
   const __m128i m_thresh = _mm_set1_epi8(thresh);
   const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
   const __m128i kFE = _mm_set1_epi8(0xFE);
@@ -353,28 +356,29 @@ static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
 // Edge filtering functions
 
 // Applies filter on 2 pixels (p0 and q0)
-static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  int thresh) {
+static WEBP_INLINE void DoFilter2_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       int thresh) {
   __m128i a, mask;
   const __m128i sign_bit = _mm_set1_epi8(0x80);
-  // convert p1/q1 to int8_t (for GetBaseDelta)
+  // convert p1/q1 to int8_t (for GetBaseDelta_SSE2)
   const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
   const __m128i q1s = _mm_xor_si128(*q1, sign_bit);
 
-  NeedsFilter(p1, p0, q0, q1, thresh, &mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &mask);
 
   FLIP_SIGN_BIT2(*p0, *q0);
-  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
+  GetBaseDelta_SSE2(&p1s, p0, q0, &q1s, &a);
   a = _mm_and_si128(a, mask);     // mask filter values we don't care about
-  DoSimpleFilter(p0, q0, &a);
+  DoSimpleFilter_SSE2(p0, q0, &a);
   FLIP_SIGN_BIT2(*p0, *q0);
 }
 
 // Applies filter on 4 pixels (p1, p0, q0 and q1)
-static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
-                                  __m128i* const q0, __m128i* const q1,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter4_SSE2(__m128i* const p1, __m128i* const p0,
+                                       __m128i* const q0, __m128i* const q1,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i sign_bit = _mm_set1_epi8(0x80);
   const __m128i k64 = _mm_set1_epi8(64);
@@ -384,7 +388,7 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
   __m128i t1, t2, t3;
 
   // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
 
   // convert to signed values
   FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
@@ -399,8 +403,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
 
   t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
   t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
-  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
-  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
+  SignedShift8b_SSE2(&t2);           // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
+  SignedShift8b_SSE2(&t3);           // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
   *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
   *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
   FLIP_SIGN_BIT2(*p0, *q0);
@@ -417,25 +421,26 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
 }
 
 // Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
-static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
-                                  __m128i* const p0, __m128i* const q0,
-                                  __m128i* const q1, __m128i* const q2,
-                                  const __m128i* const mask, int hev_thresh) {
+static WEBP_INLINE void DoFilter6_SSE2(__m128i* const p2, __m128i* const p1,
+                                       __m128i* const p0, __m128i* const q0,
+                                       __m128i* const q1, __m128i* const q2,
+                                       const __m128i* const mask,
+                                       int hev_thresh) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i sign_bit = _mm_set1_epi8(0x80);
   __m128i a, not_hev;
 
   // compute hev mask
-  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);
+  GetNotHEV_SSE2(p1, p0, q0, q1, hev_thresh, &not_hev);
 
   FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
   FLIP_SIGN_BIT2(*p2, *q2);
-  GetBaseDelta(p1, p0, q0, q1, &a);
+  GetBaseDelta_SSE2(p1, p0, q0, q1, &a);
 
   { // do simple filter on pixels with hev
     const __m128i m = _mm_andnot_si128(not_hev, *mask);
     const __m128i f = _mm_and_si128(a, m);
-    DoSimpleFilter(p0, q0, &f);
+    DoSimpleFilter_SSE2(p0, q0, &f);
   }
 
   { // do strong filter on pixels with not hev
@@ -460,15 +465,15 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
     const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
     const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63
 
-    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
-    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
-    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
+    Update2Pixels_SSE2(p2, q2, &a2_lo, &a2_hi);
+    Update2Pixels_SSE2(p1, q1, &a1_lo, &a1_hi);
+    Update2Pixels_SSE2(p0, q0, &a0_lo, &a0_hi);
   }
 }
 
 // reads 8 rows across a vertical edge.
-static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
-                                __m128i* const p, __m128i* const q) {
+static WEBP_INLINE void Load8x4_SSE2(const uint8_t* const b, int stride,
+                                     __m128i* const p, __m128i* const q) {
   // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
   // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
   const __m128i A0 = _mm_set_epi32(
@@ -494,11 +499,11 @@ static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
   *q = _mm_unpackhi_epi32(C0, C1);
 }
 
-static WEBP_INLINE void Load16x4(const uint8_t* const r0,
-                                 const uint8_t* const r8,
-                                 int stride,
-                                 __m128i* const p1, __m128i* const p0,
-                                 __m128i* const q0, __m128i* const q1) {
+static WEBP_INLINE void Load16x4_SSE2(const uint8_t* const r0,
+                                      const uint8_t* const r8,
+                                      int stride,
+                                      __m128i* const p1, __m128i* const p0,
+                                      __m128i* const q0, __m128i* const q1) {
   // Assume the pixels around the edge (|) are numbered as follows
   //                00 01 | 02 03
   //                10 11 | 12 13
@@ -514,8 +519,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
   // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
   // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
   // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-  Load8x4(r0, stride, p1, q0);
-  Load8x4(r8, stride, p0, q1);
+  Load8x4_SSE2(r0, stride, p1, q0);
+  Load8x4_SSE2(r8, stride, p0, q1);
 
   {
     // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
@@ -531,7 +536,8 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
   }
 }
 
-static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
+static WEBP_INLINE void Store4x4_SSE2(__m128i* const x,
+                                      uint8_t* dst, int stride) {
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
     WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
@@ -540,12 +546,12 @@ static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
 }
 
 // Transpose back and store
-static WEBP_INLINE void Store16x4(const __m128i* const p1,
-                                  const __m128i* const p0,
-                                  const __m128i* const q0,
-                                  const __m128i* const q1,
-                                  uint8_t* r0, uint8_t* r8,
-                                  int stride) {
+static WEBP_INLINE void Store16x4_SSE2(const __m128i* const p1,
+                                       const __m128i* const p0,
+                                       const __m128i* const q0,
+                                       const __m128i* const q1,
+                                       uint8_t* r0, uint8_t* r8,
+                                       int stride) {
   __m128i t1, p1_s, p0_s, q0_s, q1_s;
 
   // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
@@ -572,55 +578,55 @@ static WEBP_INLINE void Store16x4(const __m128i* const p1,
   p1_s = _mm_unpacklo_epi16(t1, q1_s);
   q1_s = _mm_unpackhi_epi16(t1, q1_s);
 
-  Store4x4(&p0_s, r0, stride);
+  Store4x4_SSE2(&p0_s, r0, stride);
   r0 += 4 * stride;
-  Store4x4(&q0_s, r0, stride);
+  Store4x4_SSE2(&q0_s, r0, stride);
 
-  Store4x4(&p1_s, r8, stride);
+  Store4x4_SSE2(&p1_s, r8, stride);
   r8 += 4 * stride;
-  Store4x4(&q1_s, r8, stride);
+  Store4x4_SSE2(&q1_s, r8, stride);
 }
 
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16_SSE2(uint8_t* p, int stride, int thresh) {
   // Load
   __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
   __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
   __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
   __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);
 
-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
 
   // Store
   _mm_storeu_si128((__m128i*)&p[-stride], p0);
   _mm_storeu_si128((__m128i*)&p[0], q0);
 }
 
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16_SSE2(uint8_t* p, int stride, int thresh) {
   __m128i p1, p0, q0, q1;
 
   p -= 2;  // beginning of p1
 
-  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
-  DoFilter2(&p1, &p0, &q0, &q1, thresh);
-  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
+  DoFilter2_SSE2(&p1, &p0, &q0, &q1, thresh);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
 }
 
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleVFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
+    SimpleVFilter16_SSE2(p, stride, thresh);
   }
 }
 
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+static void SimpleHFilter16i_SSE2(uint8_t* p, int stride, int thresh) {
   int k;
   for (k = 3; k > 0; --k) {
     p += 4;
-    SimpleHFilter16(p, stride, thresh);
+    SimpleHFilter16_SSE2(p, stride, thresh);
   }
 }
 
@@ -628,60 +634,60 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
 // Complex In-loop filtering (Paragraph 15.3)
 
 #define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
-  m = MM_ABS(p1, p0);                                                          \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = MM_ABS(p1, p0);                                                        \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)
 
 #define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
-  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
-  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
+  (m) = _mm_max_epu8(m, MM_ABS(p1, p0));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p3, p2));                                       \
+  (m) = _mm_max_epu8(m, MM_ABS(p2, p1));                                       \
 } while (0)
 
 #define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
-  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
-  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
-  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
-  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
+  (e1) = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                        \
+  (e2) = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                        \
+  (e3) = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                        \
+  (e4) = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                        \
 }
 
 #define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
   const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
   const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
-  p = _mm_unpacklo_epi64(U, V);                                                \
+  (p) = _mm_unpacklo_epi64(U, V);                                              \
 } while (0)
 
 #define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
-  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
-  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
-  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
-  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
+  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
+  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
+  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
+  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
 }
 
 #define STOREUV(p, u, v, stride) {                                             \
-  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
-  p = _mm_srli_si128(p, 8);                                                    \
-  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
+  _mm_storel_epi64((__m128i*)&(u)[(stride)], p);                               \
+  (p) = _mm_srli_si128(p, 8);                                                  \
+  _mm_storel_epi64((__m128i*)&(v)[(stride)], p);                               \
 }
 
-static WEBP_INLINE void ComplexMask(const __m128i* const p1,
-                                    const __m128i* const p0,
-                                    const __m128i* const q0,
-                                    const __m128i* const q1,
-                                    int thresh, int ithresh,
-                                    __m128i* const mask) {
+static WEBP_INLINE void ComplexMask_SSE2(const __m128i* const p1,
+                                         const __m128i* const p0,
+                                         const __m128i* const q0,
+                                         const __m128i* const q1,
+                                         int thresh, int ithresh,
+                                         __m128i* const mask) {
   const __m128i it = _mm_set1_epi8(ithresh);
   const __m128i diff = _mm_subs_epu8(*mask, it);
   const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
   __m128i filter_mask;
-  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
+  NeedsFilter_SSE2(p1, p0, q0, q1, thresh, &filter_mask);
   *mask = _mm_and_si128(thresh_mask, filter_mask);
 }
 
 // on macroblock edges
-static void VFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i t1;
   __m128i mask;
   __m128i p2, p1, p0, q0, q1, q2;
@@ -694,8 +700,8 @@ static void VFilter16(uint8_t* p, int stride,
   LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
   MAX_DIFF2(t1, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
   // Store
   _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
@@ -706,28 +712,28 @@ static void VFilter16(uint8_t* p, int stride,
   _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
 }
 
-static void HFilter16(uint8_t* p, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter16_SSE2(uint8_t* p, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
   uint8_t* const b = p - 4;
-  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
   MAX_DIFF1(p3, p2, p1, p0, mask);
 
-  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
+  Load16x4_SSE2(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
   MAX_DIFF2(q3, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
-  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
-  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
 }
 
 // on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void VFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   int k;
   __m128i p3, p2, p1, p0;   // loop invariants
 
@@ -744,8 +750,8 @@ static void VFilter16i(uint8_t* p, int stride,
 
     // p3 and p2 are not just temporary variables here: they will be
     // re-used for next span. And q2/q3 will become p1/p0 accordingly.
-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
     // Store
     _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
@@ -759,12 +765,12 @@ static void VFilter16i(uint8_t* p, int stride,
   }
 }
 
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
+static void HFilter16i_SSE2(uint8_t* p, int stride,
+                            int thresh, int ithresh, int hev_thresh) {
   int k;
   __m128i p3, p2, p1, p0;   // loop invariants
 
-  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
+  Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue
 
   for (k = 3; k > 0; --k) {
     __m128i mask, tmp1, tmp2;
@@ -773,13 +779,13 @@ static void HFilter16i(uint8_t* p, int stride,
     p += 4;  // beginning of q0 (and next span)
 
     MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
-    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
+    Load16x4_SSE2(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
     MAX_DIFF2(p3, p2, tmp1, tmp2, mask);
 
-    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
-    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);
+    ComplexMask_SSE2(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
+    DoFilter4_SSE2(&p1, &p0, &p3, &p2, &mask, hev_thresh);
 
-    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
+    Store16x4_SSE2(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);
 
     // rotate samples
     p1 = tmp1;
@@ -788,8 +794,8 @@ static void HFilter16i(uint8_t* p, int stride,
 }
 
 // 8-pixels wide variant, for chroma filtering
-static void VFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void VFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, p2, p1, p0, q0, q1, q2;
 
@@ -801,8 +807,8 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
   LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
   MAX_DIFF2(t1, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
   // Store
   STOREUV(p2, u, v, -3 * stride);
@@ -813,28 +819,28 @@ static void VFilter8(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q2, u, v, 2 * stride);
 }
 
-static void HFilter8(uint8_t* u, uint8_t* v, int stride,
-                     int thresh, int ithresh, int hev_thresh) {
+static void HFilter8_SSE2(uint8_t* u, uint8_t* v, int stride,
+                          int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
 
   uint8_t* const tu = u - 4;
   uint8_t* const tv = v - 4;
-  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
+  Load16x4_SSE2(tu, tv, stride, &p3, &p2, &p1, &p0);
   MAX_DIFF1(p3, p2, p1, p0, mask);
 
-  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &q2, &q3);
   MAX_DIFF2(q3, q2, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter6_SSE2(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);
 
-  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
-  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
+  Store16x4_SSE2(&p3, &p2, &p1, &p0, tu, tv, stride);
+  Store16x4_SSE2(&q0, &q1, &q2, &q3, u, v, stride);
 }
 
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void VFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
 
@@ -849,8 +855,8 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
   LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
   MAX_DIFF2(t2, t1, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
 
   // Store
   STOREUV(p1, u, v, -2 * stride);
@@ -859,24 +865,24 @@ static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
   STOREUV(q1, u, v, 1 * stride);
 }
 
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
+static void HFilter8i_SSE2(uint8_t* u, uint8_t* v, int stride,
+                           int thresh, int ithresh, int hev_thresh) {
   __m128i mask;
   __m128i t1, t2, p1, p0, q0, q1;
-  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
+  Load16x4_SSE2(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
   MAX_DIFF1(t2, t1, p1, p0, mask);
 
   u += 4;  // beginning of q0
   v += 4;
-  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
+  Load16x4_SSE2(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
   MAX_DIFF2(t2, t1, q1, q0, mask);
 
-  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
-  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);
+  ComplexMask_SSE2(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
+  DoFilter4_SSE2(&p1, &p0, &q0, &q1, &mask, hev_thresh);
 
   u -= 2;  // beginning of p1
   v -= 2;
-  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
+  Store16x4_SSE2(&p1, &p0, &q0, &q1, u, v, stride);
 }
 
 //------------------------------------------------------------------------------
@@ -893,7 +899,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 
-static void VE4(uint8_t* dst) {    // vertical
+static void VE4_SSE2(uint8_t* dst) {    // vertical
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -909,7 +915,7 @@ static void VE4(uint8_t* dst) {    // vertical
   }
 }
 
-static void LD4(uint8_t* dst) {   // Down-Left
+static void LD4_SSE2(uint8_t* dst) {   // Down-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -925,7 +931,7 @@ static void LD4(uint8_t* dst) {   // Down-Left
   WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static void VR4(uint8_t* dst) {   // Vertical-Right
+static void VR4_SSE2(uint8_t* dst) {   // Vertical-Right
   const __m128i one = _mm_set1_epi8(1);
   const int I = dst[-1 + 0 * BPS];
   const int J = dst[-1 + 1 * BPS];
@@ -950,7 +956,7 @@ static void VR4(uint8_t* dst) {   // Vertical-Right
   DST(0, 3) = AVG3(K, J, I);
 }
 
-static void VL4(uint8_t* dst) {   // Vertical-Left
+static void VL4_SSE2(uint8_t* dst) {   // Vertical-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -975,7 +981,7 @@ static void VL4(uint8_t* dst) {   // Vertical-Left
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }
 
-static void RD4(uint8_t* dst) {   // Down-right
+static void RD4_SSE2(uint8_t* dst) {   // Down-right
   const __m128i one = _mm_set1_epi8(1);
   const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
   const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
@@ -1004,7 +1010,7 @@ static void RD4(uint8_t* dst) {   // Down-right
 //------------------------------------------------------------------------------
 // Luma 16x16
 
-static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, int size) {
   const uint8_t* top = dst - BPS;
   const __m128i zero = _mm_setzero_si128();
   int y;
@@ -1041,11 +1047,11 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   }
 }
 
-static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+static void TM4_SSE2(uint8_t* dst)   { TrueMotion_SSE2(dst, 4); }
+static void TM8uv_SSE2(uint8_t* dst) { TrueMotion_SSE2(dst, 8); }
+static void TM16_SSE2(uint8_t* dst)  { TrueMotion_SSE2(dst, 16); }
 
-static void VE16(uint8_t* dst) {
+static void VE16_SSE2(uint8_t* dst) {
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   int j;
   for (j = 0; j < 16; ++j) {
@@ -1053,7 +1059,7 @@ static void VE16(uint8_t* dst) {
   }
 }
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE2(uint8_t* dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     const __m128i values = _mm_set1_epi8(dst[-1]);
@@ -1062,7 +1068,7 @@ static void HE16(uint8_t* dst) {     // horizontal
   }
 }
 
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 16; ++j) {
@@ -1070,7 +1076,7 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
   }
 }
 
-static void DC16(uint8_t* dst) {    // DC
+static void DC16_SSE2(uint8_t* dst) {  // DC
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   const __m128i sad8x2 = _mm_sad_epu8(top, zero);
@@ -1083,37 +1089,37 @@ static void DC16(uint8_t* dst) {    // DC
   }
   {
     const int DC = _mm_cvtsi128_si32(sum) + left + 16;
-    Put16(DC >> 5, dst);
+    Put16_SSE2(DC >> 5, dst);
   }
 }
 
-static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+static void DC16NoTop_SSE2(uint8_t* dst) {  // DC with top samples unavailable
   int DC = 8;
   int j;
   for (j = 0; j < 16; ++j) {
     DC += dst[-1 + j * BPS];
   }
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+static void DC16NoLeft_SSE2(uint8_t* dst) {  // DC with left samples unavailable
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
   const __m128i sad8x2 = _mm_sad_epu8(top, zero);
   // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
   const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
   const int DC = _mm_cvtsi128_si32(sum) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
-  Put16(0x80, dst);
+static void DC16NoTopLeft_SSE2(uint8_t* dst) {  // DC with no top & left samples
+  Put16_SSE2(0x80, dst);
 }
 
 //------------------------------------------------------------------------------
 // Chroma
 
-static void VE8uv(uint8_t* dst) {    // vertical
+static void VE8uv_SSE2(uint8_t* dst) {    // vertical
   int j;
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   for (j = 0; j < 8; ++j) {
@@ -1121,17 +1127,8 @@ static void VE8uv(uint8_t* dst) {    // vertical
   }
 }
 
-static void HE8uv(uint8_t* dst) {    // horizontal
-  int j;
-  for (j = 0; j < 8; ++j) {
-    const __m128i values = _mm_set1_epi8(dst[-1]);
-    _mm_storel_epi64((__m128i*)dst, values);
-    dst += BPS;
-  }
-}
-
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 8; ++j) {
@@ -1139,7 +1136,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
   }
 }
 
-static void DC8uv(uint8_t* dst) {     // DC
+static void DC8uv_SSE2(uint8_t* dst) {     // DC
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   const __m128i sum = _mm_sad_epu8(top, zero);
@@ -1150,29 +1147,29 @@ static void DC8uv(uint8_t* dst) {     // DC
   }
   {
     const int DC = _mm_cvtsi128_si32(sum) + left + 8;
-    Put8x8uv(DC >> 4, dst);
+    Put8x8uv_SSE2(DC >> 4, dst);
   }
 }
 
-static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+static void DC8uvNoLeft_SSE2(uint8_t* dst) {   // DC with no left samples
   const __m128i zero = _mm_setzero_si128();
   const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
   const __m128i sum = _mm_sad_epu8(top, zero);
   const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }
 
-static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+static void DC8uvNoTop_SSE2(uint8_t* dst) {  // DC with no top samples
   int dc0 = 4;
   int i;
   for (i = 0; i < 8; ++i) {
     dc0 += dst[-1 + i * BPS];
   }
-  Put8x8uv(dc0 >> 3, dst);
+  Put8x8uv_SSE2(dc0 >> 3, dst);
 }
 
-static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
-  Put8x8uv(0x80, dst);
+static void DC8uvNoTopLeft_SSE2(uint8_t* dst) {    // DC with nothing
+  Put8x8uv_SSE2(0x80, dst);
 }
 
 //------------------------------------------------------------------------------
@@ -1181,47 +1178,46 @@ static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
 extern void VP8DspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
-  VP8Transform = Transform;
-#if defined(USE_TRANSFORM_AC3)
-  VP8TransformAC3 = TransformAC3;
+  VP8Transform = Transform_SSE2;
+#if (USE_TRANSFORM_AC3 == 1)
+  VP8TransformAC3 = TransformAC3_SSE2;
 #endif
 
-  VP8VFilter16 = VFilter16;
-  VP8HFilter16 = HFilter16;
-  VP8VFilter8 = VFilter8;
-  VP8HFilter8 = HFilter8;
-  VP8VFilter16i = VFilter16i;
-  VP8HFilter16i = HFilter16i;
-  VP8VFilter8i = VFilter8i;
-  VP8HFilter8i = HFilter8i;
-
-  VP8SimpleVFilter16 = SimpleVFilter16;
-  VP8SimpleHFilter16 = SimpleHFilter16;
-  VP8SimpleVFilter16i = SimpleVFilter16i;
-  VP8SimpleHFilter16i = SimpleHFilter16i;
-
-  VP8PredLuma4[1] = TM4;
-  VP8PredLuma4[2] = VE4;
-  VP8PredLuma4[4] = RD4;
-  VP8PredLuma4[5] = VR4;
-  VP8PredLuma4[6] = LD4;
-  VP8PredLuma4[7] = VL4;
-
-  VP8PredLuma16[0] = DC16;
-  VP8PredLuma16[1] = TM16;
-  VP8PredLuma16[2] = VE16;
-  VP8PredLuma16[3] = HE16;
-  VP8PredLuma16[4] = DC16NoTop;
-  VP8PredLuma16[5] = DC16NoLeft;
-  VP8PredLuma16[6] = DC16NoTopLeft;
-
-  VP8PredChroma8[0] = DC8uv;
-  VP8PredChroma8[1] = TM8uv;
-  VP8PredChroma8[2] = VE8uv;
-  VP8PredChroma8[3] = HE8uv;
-  VP8PredChroma8[4] = DC8uvNoTop;
-  VP8PredChroma8[5] = DC8uvNoLeft;
-  VP8PredChroma8[6] = DC8uvNoTopLeft;
+  VP8VFilter16 = VFilter16_SSE2;
+  VP8HFilter16 = HFilter16_SSE2;
+  VP8VFilter8 = VFilter8_SSE2;
+  VP8HFilter8 = HFilter8_SSE2;
+  VP8VFilter16i = VFilter16i_SSE2;
+  VP8HFilter16i = HFilter16i_SSE2;
+  VP8VFilter8i = VFilter8i_SSE2;
+  VP8HFilter8i = HFilter8i_SSE2;
+
+  VP8SimpleVFilter16 = SimpleVFilter16_SSE2;
+  VP8SimpleHFilter16 = SimpleHFilter16_SSE2;
+  VP8SimpleVFilter16i = SimpleVFilter16i_SSE2;
+  VP8SimpleHFilter16i = SimpleHFilter16i_SSE2;
+
+  VP8PredLuma4[1] = TM4_SSE2;
+  VP8PredLuma4[2] = VE4_SSE2;
+  VP8PredLuma4[4] = RD4_SSE2;
+  VP8PredLuma4[5] = VR4_SSE2;
+  VP8PredLuma4[6] = LD4_SSE2;
+  VP8PredLuma4[7] = VL4_SSE2;
+
+  VP8PredLuma16[0] = DC16_SSE2;
+  VP8PredLuma16[1] = TM16_SSE2;
+  VP8PredLuma16[2] = VE16_SSE2;
+  VP8PredLuma16[3] = HE16_SSE2;
+  VP8PredLuma16[4] = DC16NoTop_SSE2;
+  VP8PredLuma16[5] = DC16NoLeft_SSE2;
+  VP8PredLuma16[6] = DC16NoTopLeft_SSE2;
+
+  VP8PredChroma8[0] = DC8uv_SSE2;
+  VP8PredChroma8[1] = TM8uv_SSE2;
+  VP8PredChroma8[2] = VE8uv_SSE2;
+  VP8PredChroma8[4] = DC8uvNoTop_SSE2;
+  VP8PredChroma8[5] = DC8uvNoLeft_SSE2;
+  VP8PredChroma8[6] = DC8uvNoTopLeft_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/dec_sse41.c b/thirdparty/libwebp/src/dsp/dec_sse41.c
index 4e81ec4d80..8f18506d54 100644
--- a/thirdparty/libwebp/dsp/dec_sse41.c
+++ b/thirdparty/libwebp/src/dsp/dec_sse41.c
@@ -11,15 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 
 #include <smmintrin.h>
-#include "../dec/vp8i_dec.h"
-#include "../utils/utils.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/utils/utils.h"
 
-static void HE16(uint8_t* dst) {     // horizontal
+static void HE16_SSE41(uint8_t* dst) {     // horizontal
   int j;
   const __m128i kShuffle3 = _mm_set1_epi8(3);
   for (j = 16; j > 0; --j) {
@@ -36,7 +36,7 @@ static void HE16(uint8_t* dst) {     // horizontal
 extern void VP8DspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
-  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[3] = HE16_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/dsp.h b/thirdparty/libwebp/src/dsp/dsp.h
index 813fed4a35..99eefe092f 100644
--- a/thirdparty/libwebp/dsp/dsp.h
+++ b/thirdparty/libwebp/src/dsp/dsp.h
@@ -15,10 +15,10 @@
 #define WEBP_DSP_DSP_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,10 +38,22 @@ extern "C" {
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #endif
 
+#if defined(__clang__)
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif
+
 #ifndef __has_builtin
 # define __has_builtin(x) 0
 #endif
 
+// for now, none of the optimizations below are available in emscripten
+#if !defined(EMSCRIPTEN)
+
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
     (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -68,18 +80,20 @@ extern "C" {
 #define WEBP_USE_AVX2
 #endif
 
-#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
-#define WEBP_ANDROID_NEON  // Android targets that might support NEON
-#endif
-
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
-#if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \
+#if (defined(__ARM_NEON__) || \
      defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
     !defined(__native_client__)
 #define WEBP_USE_NEON
 #endif
 
+#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
+    defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
+#define WEBP_ANDROID_NEON  // Android targets that may have NEON
+#define WEBP_USE_NEON
+#endif
+
 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
 #define WEBP_USE_NEON
 #define WEBP_USE_INTRINSICS
@@ -90,7 +104,7 @@ extern "C" {
 #define WEBP_USE_MIPS32
 #if (__mips_isa_rev >= 2)
 #define WEBP_USE_MIPS32_R2
-#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
 #define WEBP_USE_MIPS_DSP_R2
 #endif
 #endif
@@ -100,6 +114,24 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif
 
+#endif  /* EMSCRIPTEN */
+
+#ifndef WEBP_DSP_OMIT_C_CODE
+#define WEBP_DSP_OMIT_C_CODE 1
+#endif
+
+#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
+#define WEBP_NEON_OMIT_C_CODE 1
+#else
+#define WEBP_NEON_OMIT_C_CODE 0
+#endif
+
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#define WEBP_NEON_WORK_AROUND_GCC 1
+#else
+#define WEBP_NEON_WORK_AROUND_GCC 0
+#endif
+
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 #define WEBP_TSAN_IGNORE_FUNCTION
 #if defined(__has_feature)
@@ -129,6 +161,11 @@ extern "C" {
 #endif
 #endif
 
+// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define WEBP_SWAP_16BIT_CSP 0
+#endif
+
 typedef enum {
   kSSE2,
   kSSE3,
@@ -143,7 +180,7 @@ typedef enum {
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
-WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 
 //------------------------------------------------------------------------------
 // Init stub generator
@@ -271,6 +308,7 @@ typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
                                         int xo, int yo,  // center position
                                         int W, int H);   // plane dimension
 
+#if !defined(WEBP_REDUCE_SIZE)
 // This version is called with the guarantee that you can load 8 bytes and
 // 8 rows at offset src1 and src2
 typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
@@ -278,10 +316,13 @@ typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
 
 extern VP8SSIMGetFunc VP8SSIMGet;         // unclipped / unchecked
 extern VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // with clipping
+#endif
 
+#if !defined(WEBP_DISABLE_STATS)
 typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
                                          const uint8_t* src2, int len);
 extern VP8AccumulateSSEFunc VP8AccumulateSSE;
+#endif
 
 // must be called before using any of the above directly
 void VP8SSIMDspInit(void);
@@ -462,12 +503,12 @@ extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
 extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
 
 // Plain-C implementation, as fall-back.
-extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
-                                         const uint8_t* src);
-extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
-extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
+                                          const uint8_t* src);
+extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
 
 // Main entry calls:
 extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
@@ -533,24 +574,21 @@ void WebPMultRows(uint8_t* ptr, int stride,
                   int width, int num_rows, int inverse);
 
 // Plain-C versions, used as fallback by some implementations.
-void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
-                  int width, int inverse);
-void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
-
-// To be called first before using the above.
-void WebPInitAlphaProcessing(void);
-
-// ARGB packing function: a/r/g/b input is rgba or bgra order.
-extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
-                           const uint8_t* g, const uint8_t* b, int len,
-                           uint32_t* out);
+void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
+                   int width, int inverse);
+void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
-extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
-                          int len, int step, uint32_t* out);
+extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                           int len, int step, uint32_t* out);
+
+// This function returns true if src[i] contains a value different from 0xff.
+extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
+// This function returns true if src[4*i] contains a value different from 0xff.
+extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
 
 // To be called first before using the above.
-void VP8EncDspARGBInit(void);
+void WebPInitAlphaProcessing(void);
 
 //------------------------------------------------------------------------------
 // Filter functions
diff --git a/thirdparty/libwebp/dsp/enc.c b/thirdparty/libwebp/src/dsp/enc.c
index f31bc6de18..1c807f1df7 100644
--- a/thirdparty/libwebp/dsp/enc.c
+++ b/thirdparty/libwebp/src/dsp/enc.c
@@ -14,16 +14,18 @@
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 
-#include "./dsp.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
 
 static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int clip_max(int v, int max) {
   return (v > max) ? max : v;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms:
@@ -56,9 +58,10 @@ void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
   histo->last_non_zero = last_non_zero;
 }
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
+                               int start_block, int end_block,
+                               VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
@@ -76,6 +79,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
   }
   VP8SetHistogramData(distribution, histo);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
@@ -100,6 +104,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
+#if !WEBP_NEON_OMIT_C_CODE
+
 #define STORE(x, y, v) \
   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
 
@@ -140,15 +146,15 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   }
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                         int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   int i;
   int tmp[16];
   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
@@ -176,13 +182,16 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
     out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
+                          int16_t* out) {
   VP8FTransform(src, ref, out);
   VP8FTransform(src + 4, ref + 4, out + 16);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void FTransformWHT_C(const int16_t* in, int16_t* out) {
   // input is 12b signed
   int32_t tmp[16];
   int i;
@@ -211,6 +220,7 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
     out[12 + i] = b3 >> 1;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MUL
 #undef STORE
@@ -303,8 +313,8 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
+                               const uint8_t* top) {
   // U block
   DCMode(C8DC8 + dst, left, top, 8, 8, 4);
   VerticalPred(C8VE8 + dst, top, 8);
@@ -323,8 +333,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_C(uint8_t* dst,
+                           const uint8_t* left, const uint8_t* top) {
   DCMode(I16DC16 + dst, left, top, 16, 16, 5);
   VerticalPred(I16VE16 + dst, top, 16);
   HorizontalPred(I16HE16 + dst, left, 16);
@@ -507,7 +517,7 @@ static void TM4(uint8_t* dst, const uint8_t* top) {
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -523,6 +533,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Metric
 
+#if !WEBP_NEON_OMIT_C_CODE
 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
                               int w, int h) {
   int count = 0;
@@ -538,20 +549,21 @@ static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
   return count;
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 16);
 }
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 16, 8);
 }
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 8, 8);
 }
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
   return GetSSE(a, b, 4, 4);
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
   int k, x, y;
   for (k = 0; k < 4; ++k) {
     uint32_t avg = 0;
@@ -571,6 +583,7 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // We try to match the spectral content (weighted) between source and
 // reconstructed samples.
 
+#if !WEBP_NEON_OMIT_C_CODE
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
@@ -608,24 +621,25 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
+                      const uint16_t* const w) {
   const int sum1 = TTransform(a, w);
   const int sum2 = TTransform(b, w);
   return abs(sum2 - sum1) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_C(a + x + y, b + x + y, w);
     }
   }
   return D;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Quantization
@@ -636,8 +650,8 @@ static const uint8_t kZigzag[16] = {
 };
 
 // Simple quantization
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
+                           const VP8Matrix* const mtx) {
   int last = -1;
   int n;
   for (n = 0; n < 16; ++n) {
@@ -662,13 +676,15 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (last >= 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
+                             const VP8Matrix* const mtx) {
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
 
 //------------------------------------------------------------------------------
 // Block copy
@@ -682,149 +698,15 @@ static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
   }
 }
 
-static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 4, 4);
 }
 
-static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
   Copy(src, dst, 16, 8);
 }
 
 //------------------------------------------------------------------------------
-// SSIM / PSNR
-
-// hat-shaped filter. Sum of coefficients is equal to 16.
-static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
-  1, 2, 3, 4, 3, 2, 1
-};
-static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2
-
-static WEBP_INLINE double SSIMCalculation(
-    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
-  const uint32_t w2 =  N * N;
-  const uint32_t C1 = 20 * w2;
-  const uint32_t C2 = 60 * w2;
-  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
-  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
-  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
-  if (xmxm + ymym >= C3) {
-    const int64_t xmym = (int64_t)stats->xm * stats->ym;
-    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
-    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
-    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
-    // we descale by 8 to prevent overflow during the fnum/fden multiply.
-    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
-    const uint64_t den_S = (sxx + syy + C2) >> 8;
-    const uint64_t fnum = (2 * xmym + C1) * num_S;
-    const uint64_t fden = (xmxm + ymym + C1) * den_S;
-    const double r = (double)fnum / fden;
-    assert(r >= 0. && r <= 1.0);
-    return r;
-  }
-  return 1.;   // area is too dark to contribute meaningfully
-}
-
-double VP8SSIMFromStats(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, kWeightSum);
-}
-
-double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
-  return SSIMCalculation(stats, stats->w);
-}
-
-static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
-                               const uint8_t* src2, int stride2,
-                               int xo, int yo, int W, int H) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
-  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
-                                                  : yo + VP8_SSIM_KERNEL;
-  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
-  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
-                                                  : xo + VP8_SSIM_KERNEL;
-  int x, y;
-  src1 += ymin * stride1;
-  src2 += ymin * stride2;
-  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
-    for (x = xmin; x <= xmax; ++x) {
-      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
-                       * kWeight[VP8_SSIM_KERNEL + y - yo];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.w   += w;
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStatsClipped(&stats);
-}
-
-static double SSIMGet_C(const uint8_t* src1, int stride1,
-                        const uint8_t* src2, int stride2) {
-  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
-  int x, y;
-  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
-    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
-      const uint32_t w = kWeight[x] * kWeight[y];
-      const uint32_t s1 = src1[x];
-      const uint32_t s2 = src2[x];
-      stats.xm  += w * s1;
-      stats.ym  += w * s2;
-      stats.xxm += w * s1 * s1;
-      stats.xym += w * s1 * s2;
-      stats.yym += w * s2 * s2;
-    }
-  }
-  return VP8SSIMFromStats(&stats);
-}
-
-//------------------------------------------------------------------------------
-
-static uint32_t AccumulateSSE(const uint8_t* src1,
-                              const uint8_t* src2, int len) {
-  int i;
-  uint32_t sse2 = 0;
-  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
-  for (i = 0; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-//------------------------------------------------------------------------------
-
-VP8SSIMGetFunc VP8SSIMGet;
-VP8SSIMGetClippedFunc VP8SSIMGetClipped;
-VP8AccumulateSSEFunc VP8AccumulateSSE;
-
-extern void VP8SSIMDspInitSSE2(void);
-
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
-  VP8SSIMGetClipped = SSIMGetClipped_C;
-  VP8SSIMGet = SSIMGet_C;
-
-  VP8AccumulateSSE = AccumulateSSE;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SSIMDspInitSSE2();
-    }
-#endif
-  }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
-}
-
-//------------------------------------------------------------------------------
 // Initialization
 
 // Speed-critical function pointers. We have to initialize them to the default
@@ -868,26 +750,32 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   InitTables();
 
   // default C implementations
-  VP8CollectHistogram = CollectHistogram;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
-  VP8Copy4x4 = Copy4x4;
-  VP8Copy16x8 = Copy16x8;
+#if !WEBP_NEON_OMIT_C_CODE
+  VP8ITransform = ITransform_C;
+  VP8FTransform = FTransform_C;
+  VP8FTransformWHT = FTransformWHT_C;
+  VP8TDisto4x4 = Disto4x4_C;
+  VP8TDisto16x16 = Disto16x16_C;
+  VP8CollectHistogram = CollectHistogram_C;
+  VP8SSE16x16 = SSE16x16_C;
+  VP8SSE16x8 = SSE16x8_C;
+  VP8SSE8x8 = SSE8x8_C;
+  VP8SSE4x4 = SSE4x4_C;
+#endif
+
+#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
+  VP8EncQuantizeBlock = QuantizeBlock_C;
+  VP8EncQuantize2Blocks = Quantize2Blocks_C;
+#endif
+
+  VP8FTransform2 = FTransform2_C;
+  VP8EncPredLuma4 = Intra4Preds_C;
+  VP8EncPredLuma16 = Intra16Preds_C;
+  VP8EncPredChroma8 = IntraChromaPreds_C;
+  VP8Mean16x4 = Mean16x4_C;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
+  VP8Copy4x4 = Copy4x4_C;
+  VP8Copy16x8 = Copy16x8_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -906,11 +794,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
       VP8EncDspInitAVX2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8EncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8EncDspInitMIPS32();
@@ -927,5 +810,34 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8EncDspInitNEON();
+  }
+#endif
+
+  assert(VP8ITransform != NULL);
+  assert(VP8FTransform != NULL);
+  assert(VP8FTransformWHT != NULL);
+  assert(VP8TDisto4x4 != NULL);
+  assert(VP8TDisto16x16 != NULL);
+  assert(VP8CollectHistogram != NULL);
+  assert(VP8SSE16x16 != NULL);
+  assert(VP8SSE16x8 != NULL);
+  assert(VP8SSE8x8 != NULL);
+  assert(VP8SSE4x4 != NULL);
+  assert(VP8EncQuantizeBlock != NULL);
+  assert(VP8EncQuantize2Blocks != NULL);
+  assert(VP8FTransform2 != NULL);
+  assert(VP8EncPredLuma4 != NULL);
+  assert(VP8EncPredLuma16 != NULL);
+  assert(VP8EncPredChroma8 != NULL);
+  assert(VP8Mean16x4 != NULL);
+  assert(VP8EncQuantizeBlockWHT != NULL);
+  assert(VP8Copy4x4 != NULL);
+  assert(VP8Copy16x8 != NULL);
+
   enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/enc_avx2.c b/thirdparty/libwebp/src/dsp/enc_avx2.c
index 93efb30b10..8bc5798fee 100644
--- a/thirdparty/libwebp/dsp/enc_avx2.c
+++ b/thirdparty/libwebp/src/dsp/enc_avx2.c
@@ -9,7 +9,7 @@
 //
 // AVX2 version of speed-critical encoding functions.
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_AVX2)
 
diff --git a/thirdparty/libwebp/dsp/enc_mips32.c b/thirdparty/libwebp/src/dsp/enc_mips32.c
index 752b14daf6..618f0fc0ee 100644
--- a/thirdparty/libwebp/dsp/enc_mips32.c
+++ b/thirdparty/libwebp/src/dsp/enc_mips32.c
@@ -13,13 +13,13 @@
 //            Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 //            Slobodan Prijic  (slobodan.prijic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./mips_macro.h"
-#include "../enc/vp8i_enc.h"
-#include "../enc/cost_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -113,8 +113,9 @@ static const int kC2 = 35468;
   "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
 
 // Does one or two inverse transforms.
-static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
-                                      uint8_t* dst) {
+static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
+                                             const int16_t* in,
+                                             uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
   int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
   int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
@@ -144,11 +145,11 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   );
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in,
-                       uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
+                              uint8_t* dst, int do_two) {
+  ITransformOne_MIPS32(ref, in, dst);
   if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
   }
 }
 
@@ -187,8 +188,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in,
   "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
   "sh           %[level],       " #N "(%[pout])                     \n\t"
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   int sign, coeff, level, i;
   int max_level = MAX_LEVEL;
@@ -238,11 +239,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
+                                  const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -361,8 +362,8 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
   "msub   %[temp6],  %[temp0]                \n\t"                \
   "msub   %[temp7],  %[temp1]                \n\t"
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int tmp[32];
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
 
@@ -396,13 +397,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 #undef VERTICAL_PASS
 #undef HORIZONTAL_PASS
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
+                             const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -478,7 +479,8 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
   "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
   "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
+                              int16_t* out) {
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
   int temp17, temp18, temp19, temp20;
@@ -539,7 +541,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   GET_SSE_INNER(C, C + 1, C + 2, C + 3)   \
   GET_SSE_INNER(D, D + 1, D + 2, D + 3)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -573,7 +575,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -599,7 +601,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -621,7 +623,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
@@ -651,17 +653,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 extern void VP8EncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8ITransform = ITransform_MIPS32;
+  VP8FTransform = FTransform_MIPS32;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
+
+  VP8TDisto4x4 = Disto4x4_MIPS32;
+  VP8TDisto16x16 = Disto16x16_MIPS32;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPS32;
+  VP8SSE8x8 = SSE8x8_MIPS32;
+  VP8SSE16x8 = SSE16x8_MIPS32;
+  VP8SSE4x4 = SSE4x4_MIPS32;
 #endif
 }
 
diff --git a/thirdparty/libwebp/dsp/enc_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 6c8c1c6acd..9ddd895086 100644
--- a/thirdparty/libwebp/dsp/enc_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -12,13 +12,13 @@
 // Author(s): Darko Laus (darko.laus@imgtec.com)
 //            Mirko Raus (mirko.raus@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./mips_macro.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/mips_macro.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
@@ -141,7 +141,8 @@ static const int kC2 = 35468;
   "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
   "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
+                                 int16_t* out) {
   const int c2217 = 2217;
   const int c5352 = 5352;
   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
@@ -238,16 +239,16 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   );
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
+                                 uint8_t* dst, int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
+                              const uint16_t* const w) {
   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
 
@@ -313,13 +314,14 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
   return abs(temp3 - temp17) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MIPSdspR2(const uint8_t* const a,
+                                const uint8_t* const b,
+                                const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MIPSdspR2(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -1011,8 +1013,8 @@ static void HU4(uint8_t* dst, const uint8_t* top) {
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
   // U block
   DCMode8(C8DC8 + dst, left, top);
   VerticalPred8(C8VE8 + dst, top);
@@ -1031,8 +1033,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MIPSdspR2(uint8_t* dst,
+                                   const uint8_t* left, const uint8_t* top) {
   DCMode16(I16DC16 + dst, left, top);
   VerticalPred16(I16VE16 + dst, top);
   HorizontalPred16(I16HE16 + dst, left);
@@ -1041,7 +1043,7 @@ static void Intra16Preds(uint8_t* dst,
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -1077,7 +1079,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
   GET_SSE_INNER(C)                        \
   GET_SSE_INNER(D)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1107,7 +1109,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1129,7 +1131,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1147,7 +1149,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return count;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
   int count;
   int temp0, temp1, temp2, temp3;
   __asm__ volatile (
@@ -1270,8 +1272,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
   "usw         $0,           " #J "(%[ppin])                 \n\t"        \
 "3:                                                          \n\t"
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
+                                   const VP8Matrix* const mtx) {
   int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
   int sign, coeff, level;
   int max_level = MAX_LEVEL;
@@ -1311,11 +1313,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (ret != 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
+                                     const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -1358,7 +1360,7 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
   "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
   "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
   int temp0, temp1, temp2, temp3, temp4;
   int temp5, temp6, temp7, temp8, temp9;
 
@@ -1450,9 +1452,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   "sw         %[temp8],  0(%[temp3])                   \n\t"
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
+                                       int start_block, int end_block,
+                                       VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
@@ -1484,23 +1486,28 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 extern void VP8EncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
-  VP8FTransform = FTransform;
-  VP8ITransform = ITransform;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
+  VP8FTransform = FTransform_MIPSdspR2;
+  VP8FTransformWHT = FTransformWHT_MIPSdspR2;
+  VP8ITransform = ITransform_MIPSdspR2;
+
+  VP8TDisto4x4 = Disto4x4_MIPSdspR2;
+  VP8TDisto16x16 = Disto16x16_MIPSdspR2;
+
+  VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
+  VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
+  VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
+
 #if !defined(WORK_AROUND_GCC)
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE4x4 = SSE4x4;
+  VP8SSE16x16 = SSE16x16_MIPSdspR2;
+  VP8SSE8x8 = SSE8x8_MIPSdspR2;
+  VP8SSE16x8 = SSE16x8_MIPSdspR2;
+  VP8SSE4x4 = SSE4x4_MIPSdspR2;
 #endif
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8FTransformWHT = FTransformWHT;
-  VP8CollectHistogram = CollectHistogram;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
+
+  VP8CollectHistogram = CollectHistogram_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/enc_msa.c b/thirdparty/libwebp/src/dsp/enc_msa.c
index 909b46d5d9..6f85add4bb 100644
--- a/thirdparty/libwebp/dsp/enc_msa.c
+++ b/thirdparty/libwebp/src/dsp/enc_msa.c
@@ -11,13 +11,13 @@
 //
 // Author:  Prashant Patil   (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
 #include <stdlib.h>
-#include "./msa_macro.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/msa_macro.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms
@@ -69,20 +69,21 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
   ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
 }
 
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                           int do_two) {
   ITransformOne(ref, in, dst);
   if (do_two) {
     ITransformOne(ref + 4, in + 16, dst + 4);
   }
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
+                           int16_t* out) {
   uint64_t out0, out1, out2, out3;
   uint32_t in0, in1, in2, in3;
   v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   v8i16 t0, t1, t2, t3;
-  v16u8 srcl0, srcl1, src0, src1;
+  v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
   const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
   const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
   const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
@@ -130,7 +131,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   SD4(out0, out1, out2, out3, out, 8);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
   v8i16 in0 = { 0 };
   v8i16 in1 = { 0 };
   v8i16 tmp0, tmp1, tmp2, tmp3;
@@ -167,10 +168,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
   ST_SH2(out0, out1, out, 8);
 }
 
-static int TTransform(const uint8_t* in, const uint16_t* w) {
+static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
   int sum;
   uint32_t in0_m, in1_m, in2_m, in3_m;
-  v16i8 src0;
+  v16i8 src0 = { 0 };
   v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
   v4i32 dst0, dst1;
   const v16i8 zero = { 0 };
@@ -199,20 +200,20 @@ static int TTransform(const uint8_t* in, const uint16_t* w) {
   return sum;
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int sum1 = TTransform(a, w);
-  const int sum2 = TTransform(b, w);
+static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
+                        const uint16_t* const w) {
+  const int sum1 = TTransform_MSA(a, w);
+  const int sum2 = TTransform_MSA(b, w);
   return abs(sum2 - sum1) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_MSA(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -221,9 +222,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 //------------------------------------------------------------------------------
 // Histogram
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
+                                 int start_block, int end_block,
+                                 VP8Histogram* const histo) {
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
@@ -259,8 +260,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
 static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
+  const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top - 1);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C = SLDI_UB(A, A, 2);
   const v16u8 AC = __msa_ave_u_b(A, C);
@@ -292,8 +294,9 @@ static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
 }
 
 static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A2 = { 0 };
   const uint64_t val_m = LD(top - 5);
-  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
   const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C = SLDI_UB(A, A, 2);
@@ -311,8 +314,9 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
 }
 
 static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+  const v16u8 A1 = { 0 };
   const uint64_t val_m = LD(top);
-  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
   const v16u8 B = SLDI_UB(A, A, 1);
   const v16u8 C1 = SLDI_UB(A, A, 2);
   const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
@@ -427,7 +431,7 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 #undef AVG3
 #undef AVG2
 
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
   DC4(I4DC4 + dst, top);
   TM4(I4TM4 + dst, top);
   VE4(I4VE4 + dst, top);
@@ -544,8 +548,8 @@ static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
   STORE16x16(out, dst);
 }
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
+static void Intra16Preds_MSA(uint8_t* dst,
+                             const uint8_t* left, const uint8_t* top) {
   DCMode16x16(I16DC16 + dst, left, top);
   VerticalPred16x16(I16VE16 + dst, top);
   HorizontalPred16x16(I16HE16 + dst, left);
@@ -645,7 +649,7 @@ static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
 static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
                                   const uint8_t* top) {
   uint64_t out;
-  v16u8 src;
+  v16u8 src = { 0 };
   if (top != NULL && left != NULL) {
     const uint64_t left_m = LD(left);
     const uint64_t top_m = LD(top);
@@ -666,8 +670,8 @@ static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
   STORE8x8(out, dst);
 }
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
+                                 const uint8_t* top) {
   // U block
   DCMode8x8(C8DC8 + dst, left, top);
   VerticalPred8x8(C8VE8 + dst, top);
@@ -708,7 +712,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
   DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
 } while (0)
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -735,7 +739,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -754,7 +758,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
@@ -774,10 +778,10 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
   return sum;
 }
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
   uint32_t sum = 0;
   uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src, ref, tmp0, tmp1;
+  v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
   v8i16 diff0, diff1;
   v4i32 out0, out1;
 
@@ -796,8 +800,8 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 //------------------------------------------------------------------------------
 // Quantization
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
+static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
+                             const VP8Matrix* const mtx) {
   int sum;
   v8i16 in0, in1, sh0, sh1, out0, out1;
   v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
@@ -828,7 +832,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   tmp1 = (tmp3 > maxlevel);
   tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
   tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
-  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
+  SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
   tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
   tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
   LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
@@ -849,8 +853,8 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (sum > 0);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
+                               const VP8Matrix* const mtx) {
   int nz;
   nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
   nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
@@ -863,26 +867,26 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransformWHT = FTransformWHT;
-
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
-
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
-
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlock;
+  VP8ITransform = ITransform_MSA;
+  VP8FTransform = FTransform_MSA;
+  VP8FTransformWHT = FTransformWHT_MSA;
+
+  VP8TDisto4x4 = Disto4x4_MSA;
+  VP8TDisto16x16 = Disto16x16_MSA;
+  VP8CollectHistogram = CollectHistogram_MSA;
+
+  VP8EncPredLuma4 = Intra4Preds_MSA;
+  VP8EncPredLuma16 = Intra16Preds_MSA;
+  VP8EncPredChroma8 = IntraChromaPreds_MSA;
+
+  VP8SSE16x16 = SSE16x16_MSA;
+  VP8SSE16x8 = SSE16x8_MSA;
+  VP8SSE8x8 = SSE8x8_MSA;
+  VP8SSE4x4 = SSE4x4_MSA;
+
+  VP8EncQuantizeBlock = QuantizeBlock_MSA;
+  VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
+  VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/enc_neon.c b/thirdparty/libwebp/src/dsp/enc_neon.c
index 6a078d632d..43bf1245c5 100644
--- a/thirdparty/libwebp/dsp/enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/enc_neon.c
@@ -11,14 +11,14 @@
 //
 // adapted from libvpx (http://www.webmproject.org/code/)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
 
-#include "./neon.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/neon.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
@@ -37,15 +37,15 @@ static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
 #if defined(WEBP_USE_INTRINSICS)
 
 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
+static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
 }
 
 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 // to the corresponding rows of 'dst'.
-static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
-                                            const int16x8_t dst01,
-                                            const int16x8_t dst23) {
+static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
+                                                 const int16x8_t dst01,
+                                                 const int16x8_t dst23) {
   // Unsigned saturate to 8b.
   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
@@ -57,8 +57,10 @@ static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 }
 
-static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
-                               const uint8_t* const ref, uint8_t* const dst) {
+static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
+                                    const int16x8_t row23,
+                                    const uint8_t* const ref,
+                                    uint8_t* const dst) {
   uint32x2_t dst01 = vdup_n_u32(0);
   uint32x2_t dst23 = vdup_n_u32(0);
 
@@ -70,19 +72,20 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
 
   {
     // Convert to 16b.
-    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
-    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
+    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
 
     // Descale with rounding.
     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     // Add the inverse transform.
-    SaturateAndStore4x4(dst, out01, out23);
+    SaturateAndStore4x4_NEON(dst, out01, out23);
   }
 }
 
-static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
-                                     int16x8x2_t* const out) {
+static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
+                                          const int16x8_t in1,
+                                          int16x8x2_t* const out) {
   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
@@ -90,7 +93,7 @@ static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
 }
 
-static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
+static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   // {rows} = in0 | in4
   //          in8 | in12
   // B1 = in4 | in12
@@ -113,22 +116,22 @@ static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
-  Transpose8x2(E0, E1, rows);
+  Transpose8x2_NEON(E0, E1, rows);
 }
 
-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
   int16x8x2_t rows;
   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
-  TransformPass(&rows);
-  TransformPass(&rows);
-  Add4x4(rows.val[0], rows.val[1], ref, dst);
+  TransformPass_NEON(&rows);
+  TransformPass_NEON(&rows);
+  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
 }
 
 #else
 
-static void ITransformOne(const uint8_t* ref,
-                          const int16_t* in, uint8_t* dst) {
+static void ITransformOne_NEON(const uint8_t* ref,
+                               const int16_t* in, uint8_t* dst) {
   const int kBPS = BPS;
   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
 
@@ -243,16 +246,16 @@ static void ITransformOne(const uint8_t* ref,
 
 #endif    // WEBP_USE_INTRINSICS
 
-static void ITransform(const uint8_t* ref,
-                       const int16_t* in, uint8_t* dst, int do_two) {
-  ITransformOne(ref, in, dst);
+static void ITransform_NEON(const uint8_t* ref,
+                            const int16_t* in, uint8_t* dst, int do_two) {
+  ITransformOne_NEON(ref, in, dst);
   if (do_two) {
-    ITransformOne(ref + 4, in + 16, dst + 4);
+    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
   }
 }
 
 // Load all 4x4 pixels into a single uint8x16_t variable.
-static uint8x16_t Load4x4(const uint8_t* src) {
+static uint8x16_t Load4x4_NEON(const uint8_t* src) {
   uint32x4_t out = vdupq_n_u32(0);
   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
@@ -265,10 +268,12 @@ static uint8x16_t Load4x4(const uint8_t* src) {
 
 #if defined(WEBP_USE_INTRINSICS)
 
-static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
-                                         const int16x4_t C, const int16x4_t D,
-                                         int16x8_t* const out01,
-                                         int16x8_t* const out32) {
+static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
+                                              const int16x4_t B,
+                                              const int16x4_t C,
+                                              const int16x4_t D,
+                                              int16x8_t* const out01,
+                                              int16x8_t* const out32) {
   const int16x4x2_t AB = vtrn_s16(A, B);
   const int16x4x2_t CD = vtrn_s16(C, D);
   const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
@@ -283,24 +288,24 @@ static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                    vreinterpret_s64_s32(tmp02.val[1])));
 }
 
-static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
-                                         const uint8x8_t b) {
+static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
+                                              const uint8x8_t b) {
   return vreinterpretq_s16_u16(vsubl_u8(a, b));
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
   {
-    const uint8x16_t S0 = Load4x4(src);
-    const uint8x16_t R0 = Load4x4(ref);
-    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
-    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
+    const uint8x16_t S0 = Load4x4_NEON(src);
+    const uint8x16_t R0 = Load4x4_NEON(ref);
+    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
+    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
     const int16x4_t D0 = vget_low_s16(D0D1);
     const int16x4_t D1 = vget_high_s16(D0D1);
     const int16x4_t D2 = vget_low_s16(D2D3);
     const int16x4_t D3 = vget_high_s16(D2D3);
-    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
   }
   {    // 1rst pass
     const int32x4_t kCst937 = vdupq_n_s32(937);
@@ -318,7 +323,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
     const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
     const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
-    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
+    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
   }
   {    // 2nd pass
     // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
@@ -358,8 +363,8 @@ static const int32_t kCoeff32[] = {
   51000, 51000, 51000, 51000
 };
 
-static void FTransform(const uint8_t* src, const uint8_t* ref,
-                       int16_t* out) {
+static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   const int kBPS = BPS;
   const uint8_t* src_ptr = src;
   const uint8_t* ref_ptr = ref;
@@ -478,7 +483,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref,
   src += stride;                                    \
 } while (0)
 
-static void FTransformWHT(const int16_t* src, int16_t* out) {
+static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
   const int stride = 16;
   const int16x4_t zero = vdup_n_s16(0);
   int32x4x4_t tmp0;
@@ -516,7 +521,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
     tmp0.val[3] = vsubq_s32(a0, a1);
   }
   {
-    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
+    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
     // a0 = tmp[0 + i] + tmp[ 8 + i]
     // a1 = tmp[4 + i] + tmp[12 + i]
     // a2 = tmp[4 + i] - tmp[12 + i]
@@ -560,7 +565,7 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
 // a 26ae, b 26ae
 // a 37bf, b 37bf
 //
-static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
   const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
   const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
   const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
@@ -574,7 +579,8 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
   return q4_in;
 }
 
-static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
+    const int16x8x4_t q4_in) {
   // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
   // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
   const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
@@ -593,7 +599,7 @@ static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) {
   return q4_out;
 }
 
-static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
+static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
   const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                         q4_in.val[2]));
   const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
@@ -610,7 +616,7 @@ static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) {
   return q4_out;
 }
 
-static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
   const uint16x8_t q_w07 = vld1q_u16(&w[0]);
   const uint16x8_t q_w8f = vld1q_u16(&w[8]);
   int16x4x4_t d4_w;
@@ -622,8 +628,8 @@ static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
   return d4_w;
 }
 
-static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
-                                      const int16x4x4_t d4_w) {
+static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
+                                           const int16x4x4_t d4_w) {
   int32x2_t d_sum;
   // sum += w[ 0] * abs(b0);
   // sum += w[ 4] * abs(b1);
@@ -652,8 +658,8 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
+static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
@@ -679,12 +685,12 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
     // Vertical pass first to avoid a transpose (vertical and horizontal passes
     // are commutative because w/kWeightY is symmetric) and subsequent
     // transpose.
-    const int16x8x4_t q4_v = DistoVerticalPass(d4_in);
-    const int16x4x4_t d4_w = DistoLoadW(w);
+    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
+    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
     // horizontal pass
-    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v);
-    const int16x8x4_t q4_h = DistoHorizontalPass(q4_t);
-    int32x2_t d_sum = DistoSum(q4_h, d4_w);
+    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
+    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
+    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
 
     // abs(sum2 - sum1) >> 5
     d_sum = vabs_s32(d_sum);
@@ -694,13 +700,13 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 }
 #undef LOAD_LANE_32b
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_NEON(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -708,15 +714,15 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 
 //------------------------------------------------------------------------------
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
   for (j = start_block; j < end_block; ++j) {
     int16_t out[16];
-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
     {
       int k;
       const int16x8_t a0 = vld1q_s16(out + 0);
@@ -740,9 +746,9 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 
 //------------------------------------------------------------------------------
 
-static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
-                                        const uint8_t* const b,
-                                        uint32x4_t* const sum) {
+static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
+                                             const uint8_t* const b,
+                                             uint32x4_t* const sum) {
   const uint8x16_t a0 = vld1q_u8(a);
   const uint8x16_t b0 = vld1q_u8(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
@@ -757,7 +763,7 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
 }
 
 // Horizontal sum of all four uint32_t values in 'sum'.
-static int SumToInt(uint32x4_t sum) {
+static int SumToInt_NEON(uint32x4_t sum) {
   const uint64x2_t sum2 = vpaddlq_u32(sum);
   const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
   return (int)sum3;
@@ -767,18 +773,18 @@ static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 16; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
   uint32x4_t sum = vdupq_n_u32(0);
   int y;
   for (y = 0; y < 8; ++y) {
-    AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
+    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
@@ -791,12 +797,12 @@ static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
     const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
     sum = vpadalq_u16(sum, prod);
   }
-  return SumToInt(sum);
+  return SumToInt_NEON(sum);
 }
 
 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
-  const uint8x16_t a0 = Load4x4(a);
-  const uint8x16_t b0 = Load4x4(b);
+  const uint8x16_t a0 = Load4x4_NEON(a);
+  const uint8x16_t b0 = Load4x4_NEON(b);
   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
   const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                     vget_low_u8(abs_diff));
@@ -805,7 +811,7 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
   /* pair-wise adds and widen */
   const uint32x4_t sum1 = vpaddlq_u16(prod1);
   const uint32x4_t sum2 = vpaddlq_u16(prod2);
-  return SumToInt(vaddq_u32(sum1, sum2));
+  return SumToInt_NEON(vaddq_u32(sum1, sum2));
 }
 
 //------------------------------------------------------------------------------
@@ -813,8 +819,8 @@ static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
 // Compilation with gcc-4.6.x is problematic for now.
 #if !defined(WORK_AROUND_GCC)
 
-static int16x8_t Quantize(int16_t* const in,
-                          const VP8Matrix* const mtx, int offset) {
+static int16x8_t Quantize_NEON(int16_t* const in,
+                               const VP8Matrix* const mtx, int offset) {
   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
@@ -847,10 +853,10 @@ static const uint8_t kShuffles[4][8] = {
   { 14, 15, 22, 23, 28, 29, 30, 31 }
 };
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  const int16x8_t out0 = Quantize(in, mtx, 0);
-  const int16x8_t out1 = Quantize(in, mtx, 8);
+static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
+  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
   uint8x8x4_t shuffles;
   // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
   // non-standard versions there.
@@ -889,11 +895,11 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return 0;
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
   int nz;
-  nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
-  nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
+  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
   return nz;
 }
 
@@ -905,14 +911,14 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
+  VP8ITransform = ITransform_NEON;
+  VP8FTransform = FTransform_NEON;
 
-  VP8FTransformWHT = FTransformWHT;
+  VP8FTransformWHT = FTransformWHT_NEON;
 
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8CollectHistogram = CollectHistogram;
+  VP8TDisto4x4 = Disto4x4_NEON;
+  VP8TDisto16x16 = Disto16x16_NEON;
+  VP8CollectHistogram = CollectHistogram_NEON;
 
   VP8SSE16x16 = SSE16x16_NEON;
   VP8SSE16x8 = SSE16x8_NEON;
@@ -920,8 +926,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
   VP8SSE4x4 = SSE4x4_NEON;
 
 #if !defined(WORK_AROUND_GCC)
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
+  VP8EncQuantizeBlock = QuantizeBlock_NEON;
+  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
 #endif
 }
 
diff --git a/thirdparty/libwebp/dsp/enc_sse2.c b/thirdparty/libwebp/src/dsp/enc_sse2.c
index 2026a74c91..7b3f142c31 100644
--- a/thirdparty/libwebp/dsp/enc_sse2.c
+++ b/thirdparty/libwebp/src/dsp/enc_sse2.c
@@ -11,23 +11,23 @@
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
 
-#include "./common_sse2.h"
-#include "../enc/cost_enc.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 
 // Does one or two inverse transforms.
-static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
-                       int do_two) {
+static void ITransform_SSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+                            int do_two) {
   // This implementation makes use of 16-bit fixed point versions of two
   // multiply constants:
   //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -193,10 +193,10 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
   }
 }
 
-static void FTransformPass1(const __m128i* const in01,
-                            const __m128i* const in23,
-                            __m128i* const out01,
-                            __m128i* const out32) {
+static void FTransformPass1_SSE2(const __m128i* const in01,
+                                 const __m128i* const in23,
+                                 __m128i* const out01,
+                                 __m128i* const out32) {
   const __m128i k937 = _mm_set1_epi32(937);
   const __m128i k1812 = _mm_set1_epi32(1812);
 
@@ -239,8 +239,9 @@ static void FTransformPass1(const __m128i* const in01,
   *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
 }
 
-static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
-                            int16_t* out) {
+static void FTransformPass2_SSE2(const __m128i* const v01,
+                                 const __m128i* const v32,
+                                 int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i seven = _mm_set1_epi16(7);
   const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
@@ -291,7 +292,8 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
   _mm_storeu_si128((__m128i*)&out[8], d2_f3);
 }
 
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform_SSE2(const uint8_t* src, const uint8_t* ref,
+                            int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
   // Load src.
   const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
@@ -328,13 +330,14 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   __m128i v01, v32;
 
   // First pass
-  FTransformPass1(&row01, &row23, &v01, &v32);
+  FTransformPass1_SSE2(&row01, &row23, &v01, &v32);
 
   // Second pass
-  FTransformPass2(&v01, &v32, out);
+  FTransformPass2_SSE2(&v01, &v32, out);
 }
 
-static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+static void FTransform2_SSE2(const uint8_t* src, const uint8_t* ref,
+                             int16_t* out) {
   const __m128i zero = _mm_setzero_si128();
 
   // Load src and convert to 16b.
@@ -374,15 +377,15 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
   __m128i v01h, v32h;
 
   // First pass
-  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
-  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
+  FTransformPass1_SSE2(&shuf01l, &shuf23l, &v01l, &v32l);
+  FTransformPass1_SSE2(&shuf01h, &shuf23h, &v01h, &v32h);
 
   // Second pass
-  FTransformPass2(&v01l, &v32l, out + 0);
-  FTransformPass2(&v01h, &v32h, out + 16);
+  FTransformPass2_SSE2(&v01l, &v32l, out + 0);
+  FTransformPass2_SSE2(&v01h, &v32h, out + 16);
 }
 
-static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+static void FTransformWHTRow_SSE2(const int16_t* const in, __m128i* const out) {
   const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
   const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
   const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
@@ -398,14 +401,14 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
   *out = _mm_madd_epi16(D, kMult);
 }
 
-static void FTransformWHT(const int16_t* in, int16_t* out) {
+static void FTransformWHT_SSE2(const int16_t* in, int16_t* out) {
   // Input is 12b signed.
   __m128i row0, row1, row2, row3;
   // Rows are 14b signed.
-  FTransformWHTRow(in + 0 * 64, &row0);
-  FTransformWHTRow(in + 1 * 64, &row1);
-  FTransformWHTRow(in + 2 * 64, &row2);
-  FTransformWHTRow(in + 3 * 64, &row3);
+  FTransformWHTRow_SSE2(in + 0 * 64, &row0);
+  FTransformWHTRow_SSE2(in + 1 * 64, &row1);
+  FTransformWHTRow_SSE2(in + 2 * 64, &row2);
+  FTransformWHTRow_SSE2(in + 3 * 64, &row3);
 
   {
     // The a* are 15b signed.
@@ -431,9 +434,9 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
 // Compute susceptibility based on DCT-coeff histograms:
 // the higher, the "easier" the macroblock is to compress.
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE2(const uint8_t* ref, const uint8_t* pred,
+                                  int start_block, int end_block,
+                                  VP8Histogram* const histo) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
@@ -442,7 +445,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
     int16_t out[16];
     int k;
 
-    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+    FTransform_SSE2(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 
     // Convert coefficients to bin (within out[]).
     {
@@ -476,7 +479,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Intra predictions
 
 // helper for chroma-DC predictions
-static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put8x8uv_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 8; ++j) {
@@ -484,7 +487,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
   }
 }
 
-static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+static WEBP_INLINE void Put16_SSE2(uint8_t v, uint8_t* dst) {
   int j;
   const __m128i values = _mm_set1_epi8(v);
   for (j = 0; j < 16; ++j) {
@@ -492,20 +495,20 @@ static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
   }
 }
 
-static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+static WEBP_INLINE void Fill_SSE2(uint8_t* dst, int value, int size) {
   if (size == 4) {
     int j;
     for (j = 0; j < 4; ++j) {
       memset(dst + j * BPS, value, 4);
     }
   } else if (size == 8) {
-    Put8x8uv(value, dst);
+    Put8x8uv_SSE2(value, dst);
   } else {
-    Put16(value, dst);
+    Put16_SSE2(value, dst);
   }
 }
 
-static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE8uv_SSE2(uint8_t* dst, const uint8_t* top) {
   int j;
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   for (j = 0; j < 8; ++j) {
@@ -513,7 +516,7 @@ static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
   }
 }
 
-static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void VE16_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i top_values = _mm_load_si128((const __m128i*)top);
   int j;
   for (j = 0; j < 16; ++j) {
@@ -521,20 +524,20 @@ static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
   }
 }
 
-static WEBP_INLINE void VerticalPred(uint8_t* dst,
-                                     const uint8_t* top, int size) {
+static WEBP_INLINE void VerticalPred_SSE2(uint8_t* dst,
+                                          const uint8_t* top, int size) {
   if (top != NULL) {
     if (size == 8) {
-      VE8uv(dst, top);
+      VE8uv_SSE2(dst, top);
     } else {
-      VE16(dst, top);
+      VE16_SSE2(dst, top);
     }
   } else {
-    Fill(dst, 127, size);
+    Fill_SSE2(dst, 127, size);
   }
 }
 
-static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE8uv_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 8; ++j) {
     const __m128i values = _mm_set1_epi8(left[j]);
@@ -543,7 +546,7 @@ static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
   }
 }
 
-static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void HE16_SSE2(uint8_t* dst, const uint8_t* left) {
   int j;
   for (j = 0; j < 16; ++j) {
     const __m128i values = _mm_set1_epi8(left[j]);
@@ -552,21 +555,21 @@ static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
   }
 }
 
-static WEBP_INLINE void HorizontalPred(uint8_t* dst,
-                                       const uint8_t* left, int size) {
+static WEBP_INLINE void HorizontalPred_SSE2(uint8_t* dst,
+                                            const uint8_t* left, int size) {
   if (left != NULL) {
     if (size == 8) {
-      HE8uv(dst, left);
+      HE8uv_SSE2(dst, left);
     } else {
-      HE16(dst, left);
+      HE16_SSE2(dst, left);
     }
   } else {
-    Fill(dst, 129, size);
+    Fill_SSE2(dst, 129, size);
   }
 }
 
-static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
-                           const uint8_t* top, int size) {
+static WEBP_INLINE void TM_SSE2(uint8_t* dst, const uint8_t* left,
+                                const uint8_t* top, int size) {
   const __m128i zero = _mm_setzero_si128();
   int y;
   if (size == 8) {
@@ -593,13 +596,13 @@ static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
   }
 }
 
-static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
-                                   const uint8_t* top, int size) {
+static WEBP_INLINE void TrueMotion_SSE2(uint8_t* dst, const uint8_t* left,
+                                        const uint8_t* top, int size) {
   if (left != NULL) {
     if (top != NULL) {
-      TM(dst, left, top, size);
+      TM_SSE2(dst, left, top, size);
     } else {
-      HorizontalPred(dst, left, size);
+      HorizontalPred_SSE2(dst, left, size);
     }
   } else {
     // true motion without left samples (hence: with default 129 value)
@@ -607,90 +610,90 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
     // Note that if top samples are not available, the default value is
     // then 129, and not 127 as in the VerticalPred case.
     if (top != NULL) {
-      VerticalPred(dst, top, size);
+      VerticalPred_SSE2(dst, top, size);
     } else {
-      Fill(dst, 129, size);
+      Fill_SSE2(dst, 129, size);
     }
   }
 }
 
-static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
-                              const uint8_t* top) {
+static WEBP_INLINE void DC8uv_SSE2(uint8_t* dst, const uint8_t* left,
+                                   const uint8_t* top) {
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
   const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
   const int DC = VP8HorizontalAdd8b(&combined) + 8;
-  Put8x8uv(DC >> 4, dst);
+  Put8x8uv_SSE2(DC >> 4, dst);
 }
 
-static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC8uvNoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
   const __m128i sum = _mm_sad_epu8(top_values, zero);
   const int DC = _mm_cvtsi128_si32(sum) + 4;
-  Put8x8uv(DC >> 3, dst);
+  Put8x8uv_SSE2(DC >> 3, dst);
 }
 
-static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC8uvNoTop_SSE2(uint8_t* dst, const uint8_t* left) {
   // 'left' is contiguous so we can reuse the top summation.
-  DC8uvNoLeft(dst, left);
+  DC8uvNoLeft_SSE2(dst, left);
 }
 
-static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
-  Put8x8uv(0x80, dst);
+static WEBP_INLINE void DC8uvNoTopLeft_SSE2(uint8_t* dst) {
+  Put8x8uv_SSE2(0x80, dst);
 }
 
-static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
-                                  const uint8_t* top) {
+static WEBP_INLINE void DC8uvMode_SSE2(uint8_t* dst, const uint8_t* left,
+                                       const uint8_t* top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
-      DC8uv(dst, left, top);
+      DC8uv_SSE2(dst, left, top);
     } else {  // top, but no left
-      DC8uvNoLeft(dst, top);
+      DC8uvNoLeft_SSE2(dst, top);
     }
   } else if (left != NULL) {  // left but no top
-    DC8uvNoTop(dst, left);
+    DC8uvNoTop_SSE2(dst, left);
   } else {  // no top, no left, nothing.
-    DC8uvNoTopLeft(dst);
+    DC8uvNoTopLeft_SSE2(dst);
   }
 }
 
-static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static WEBP_INLINE void DC16_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const __m128i left_row = _mm_load_si128((const __m128i*)left);
   const int DC =
       VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
-  Put16(DC >> 5, dst);
+  Put16_SSE2(DC >> 5, dst);
 }
 
-static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC16NoLeft_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i top_row = _mm_load_si128((const __m128i*)top);
   const int DC = VP8HorizontalAdd8b(&top_row) + 8;
-  Put16(DC >> 4, dst);
+  Put16_SSE2(DC >> 4, dst);
 }
 
-static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+static WEBP_INLINE void DC16NoTop_SSE2(uint8_t* dst, const uint8_t* left) {
   // 'left' is contiguous so we can reuse the top summation.
-  DC16NoLeft(dst, left);
+  DC16NoLeft_SSE2(dst, left);
 }
 
-static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
-  Put16(0x80, dst);
+static WEBP_INLINE void DC16NoTopLeft_SSE2(uint8_t* dst) {
+  Put16_SSE2(0x80, dst);
 }
 
-static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
-                                 const uint8_t* top) {
+static WEBP_INLINE void DC16Mode_SSE2(uint8_t* dst, const uint8_t* left,
+                                      const uint8_t* top) {
   if (top != NULL) {
     if (left != NULL) {  // top and left present
-      DC16(dst, left, top);
+      DC16_SSE2(dst, left, top);
     } else {  // top, but no left
-      DC16NoLeft(dst, top);
+      DC16NoLeft_SSE2(dst, top);
     }
   } else if (left != NULL) {  // left but no top
-    DC16NoTop(dst, left);
+    DC16NoTop_SSE2(dst, left);
   } else {  // no top, no left, nothing.
-    DC16NoTopLeft(dst);
+    DC16NoTopLeft_SSE2(dst);
   }
 }
 
@@ -709,7 +712,8 @@ static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
 //   where: AC = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
 //   and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1
 
-static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
+static WEBP_INLINE void VE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // vertical
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -725,7 +729,8 @@ static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
   }
 }
 
-static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
+static WEBP_INLINE void HE4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // horizontal
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];
@@ -737,14 +742,15 @@ static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
 }
 
-static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void DC4_SSE2(uint8_t* dst, const uint8_t* top) {
   uint32_t dc = 4;
   int i;
   for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
-  Fill(dst, dc >> 3, 4);
+  Fill_SSE2(dst, dc >> 3, 4);
 }
 
-static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
+static WEBP_INLINE void LD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
@@ -760,8 +766,8 @@ static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
   WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static WEBP_INLINE void VR4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Right
+static WEBP_INLINE void VR4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Right
   const __m128i one = _mm_set1_epi8(1);
   const int I = top[-2];
   const int J = top[-3];
@@ -786,8 +792,8 @@ static WEBP_INLINE void VR4(uint8_t* dst,
   DST(0, 3) = AVG3(K, J, I);
 }
 
-static WEBP_INLINE void VL4(uint8_t* dst,
-                            const uint8_t* top) {  // Vertical-Left
+static WEBP_INLINE void VL4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Vertical-Left
   const __m128i one = _mm_set1_epi8(1);
   const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
   const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
@@ -812,7 +818,8 @@ static WEBP_INLINE void VL4(uint8_t* dst,
   DST(3, 3) = (extra_out >> 8) & 0xff;
 }
 
-static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
+static WEBP_INLINE void RD4_SSE2(uint8_t* dst,
+                                 const uint8_t* top) {  // Down-right
   const __m128i one = _mm_set1_epi8(1);
   const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
   const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
@@ -828,7 +835,7 @@ static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
   WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
 }
 
-static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HU4_SSE2(uint8_t* dst, const uint8_t* top) {
   const int I = top[-2];
   const int J = top[-3];
   const int K = top[-4];
@@ -843,7 +850,7 @@ static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
   DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 
-static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void HD4_SSE2(uint8_t* dst, const uint8_t* top) {
   const int X = top[-1];
   const int I = top[-2];
   const int J = top[-3];
@@ -866,7 +873,7 @@ static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
   DST(1, 3)             = AVG3(L, K, J);
 }
 
-static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+static WEBP_INLINE void TM4_SSE2(uint8_t* dst, const uint8_t* top) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
   const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
@@ -888,55 +895,56 @@ static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
 
 // Left samples are top[-5 .. -2], top_left is top[-1], top are
 // located at top[0..3], and top right is top[4..7]
-static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
-  DC4(I4DC4 + dst, top);
-  TM4(I4TM4 + dst, top);
-  VE4(I4VE4 + dst, top);
-  HE4(I4HE4 + dst, top);
-  RD4(I4RD4 + dst, top);
-  VR4(I4VR4 + dst, top);
-  LD4(I4LD4 + dst, top);
-  VL4(I4VL4 + dst, top);
-  HD4(I4HD4 + dst, top);
-  HU4(I4HU4 + dst, top);
+static void Intra4Preds_SSE2(uint8_t* dst, const uint8_t* top) {
+  DC4_SSE2(I4DC4 + dst, top);
+  TM4_SSE2(I4TM4 + dst, top);
+  VE4_SSE2(I4VE4 + dst, top);
+  HE4_SSE2(I4HE4 + dst, top);
+  RD4_SSE2(I4RD4 + dst, top);
+  VR4_SSE2(I4VR4 + dst, top);
+  LD4_SSE2(I4LD4 + dst, top);
+  VL4_SSE2(I4VL4 + dst, top);
+  HD4_SSE2(I4HD4 + dst, top);
+  HU4_SSE2(I4HU4 + dst, top);
 }
 
 //------------------------------------------------------------------------------
 // Chroma 8x8 prediction (paragraph 12.2)
 
-static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
-                             const uint8_t* top) {
+static void IntraChromaPreds_SSE2(uint8_t* dst, const uint8_t* left,
+                                  const uint8_t* top) {
   // U block
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
   // V block
   dst += 8;
   if (top != NULL) top += 8;
   if (left != NULL) left += 16;
-  DC8uvMode(C8DC8 + dst, left, top);
-  VerticalPred(C8VE8 + dst, top, 8);
-  HorizontalPred(C8HE8 + dst, left, 8);
-  TrueMotion(C8TM8 + dst, left, top, 8);
+  DC8uvMode_SSE2(C8DC8 + dst, left, top);
+  VerticalPred_SSE2(C8VE8 + dst, top, 8);
+  HorizontalPred_SSE2(C8HE8 + dst, left, 8);
+  TrueMotion_SSE2(C8TM8 + dst, left, top, 8);
 }
 
 //------------------------------------------------------------------------------
 // luma 16x16 prediction (paragraph 12.3)
 
-static void Intra16Preds(uint8_t* dst,
-                         const uint8_t* left, const uint8_t* top) {
-  DC16Mode(I16DC16 + dst, left, top);
-  VerticalPred(I16VE16 + dst, top, 16);
-  HorizontalPred(I16HE16 + dst, left, 16);
-  TrueMotion(I16TM16 + dst, left, top, 16);
+static void Intra16Preds_SSE2(uint8_t* dst,
+                              const uint8_t* left, const uint8_t* top) {
+  DC16Mode_SSE2(I16DC16 + dst, left, top);
+  VerticalPred_SSE2(I16VE16 + dst, top, 16);
+  HorizontalPred_SSE2(I16HE16 + dst, left, 16);
+  TrueMotion_SSE2(I16TM16 + dst, left, top, 16);
 }
 
 //------------------------------------------------------------------------------
 // Metric
 
-static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
-                                              __m128i* const sum) {
+static WEBP_INLINE void SubtractAndAccumulate_SSE2(const __m128i a,
+                                                   const __m128i b,
+                                                   __m128i* const sum) {
   // take abs(a-b) in 8b
   const __m128i a_b = _mm_subs_epu8(a, b);
   const __m128i b_a = _mm_subs_epu8(b, a);
@@ -951,8 +959,8 @@ static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
   *sum = _mm_add_epi32(sum1, sum2);
 }
 
-static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
-                                int num_pairs) {
+static WEBP_INLINE int SSE_16xN_SSE2(const uint8_t* a, const uint8_t* b,
+                                     int num_pairs) {
   __m128i sum = _mm_setzero_si128();
   int32_t tmp[4];
   int i;
@@ -963,8 +971,8 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
     const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
     const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
     __m128i sum1, sum2;
-    SubtractAndAccumulate(a0, b0, &sum1);
-    SubtractAndAccumulate(a1, b1, &sum2);
+    SubtractAndAccumulate_SSE2(a0, b0, &sum1);
+    SubtractAndAccumulate_SSE2(a1, b1, &sum2);
     sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
     a += 2 * BPS;
     b += 2 * BPS;
@@ -973,18 +981,18 @@ static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
   return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
 }
 
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 8);
+static int SSE16x16_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 8);
 }
 
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
-  return SSE_16xN(a, b, 4);
+static int SSE16x8_SSE2(const uint8_t* a, const uint8_t* b) {
+  return SSE_16xN_SSE2(a, b, 4);
 }
 
 #define LOAD_8x16b(ptr) \
   _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
 
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_SSE2(const uint8_t* a, const uint8_t* b) {
   const __m128i zero = _mm_setzero_si128();
   int num_pairs = 4;
   __m128i sum = zero;
@@ -1011,7 +1019,7 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 }
 #undef LOAD_8x16b
 
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_SSE2(const uint8_t* a, const uint8_t* b) {
   const __m128i zero = _mm_setzero_si128();
 
   // Load values. Note that we read 8 pixels instead of 4,
@@ -1048,7 +1056,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 
 //------------------------------------------------------------------------------
 
-static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+static void Mean16x4_SSE2(const uint8_t* ref, uint32_t dc[4]) {
   const __m128i mask = _mm_set1_epi16(0x00ff);
   const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
   const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
@@ -1086,8 +1094,8 @@ static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE2(const uint8_t* inA, const uint8_t* inB,
+                           const uint16_t* const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
@@ -1187,19 +1195,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE2(const uint8_t* const a, const uint8_t* const b,
+                         const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE2(a, b, w);
   return abs(diff_sum) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE2(const uint8_t* const a, const uint8_t* const b,
+                           const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE2(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -1209,9 +1217,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 // Quantization
 //
 
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                                            const uint16_t* const sharpen,
+                                            const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   __m128i coeff0, coeff8;
@@ -1321,22 +1329,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
   return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
 }
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE2(int16_t in[16], int16_t out[16],
+                              const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, &mtx->sharpen_[0], mtx);
 }
 
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE2(int16_t in[16], int16_t out[16],
+                                 const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE2(in, out, NULL, mtx);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE2(int16_t in[32], int16_t out[32],
+                                const VP8Matrix* const mtx) {
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE2(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE2(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
   return nz;
 }
 
@@ -1346,139 +1354,28 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 extern void VP8EncDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncPredLuma16 = Intra16Preds;
-  VP8EncPredChroma8 = IntraChromaPreds;
-  VP8EncPredLuma4 = Intra4Preds;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8ITransform = ITransform;
-  VP8FTransform = FTransform;
-  VP8FTransform2 = FTransform2;
-  VP8FTransformWHT = FTransformWHT;
-  VP8SSE16x16 = SSE16x16;
-  VP8SSE16x8 = SSE16x8;
-  VP8SSE8x8 = SSE8x8;
-  VP8SSE4x4 = SSE4x4;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
-  VP8Mean16x4 = Mean16x4;
-}
-
-//------------------------------------------------------------------------------
-// SSIM / PSNR entry point (TODO(skal): move to its own file later)
-
-static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
-                                   const uint8_t* src2, int len) {
-  int i = 0;
-  uint32_t sse2 = 0;
-  if (len >= 16) {
-    const int limit = len - 32;
-    int32_t tmp[4];
-    __m128i sum1;
-    __m128i sum = _mm_setzero_si128();
-    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-    i += 16;
-    while (i <= limit) {
-      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      __m128i sum2;
-      i += 16;
-      SubtractAndAccumulate(a0, b0, &sum1);
-      sum = _mm_add_epi32(sum, sum1);
-      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
-      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
-      i += 16;
-      SubtractAndAccumulate(a1, b1, &sum2);
-      sum = _mm_add_epi32(sum, sum2);
-    }
-    SubtractAndAccumulate(a0, b0, &sum1);
-    sum = _mm_add_epi32(sum, sum1);
-    _mm_storeu_si128((__m128i*)tmp, sum);
-    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
-  }
-
-  for (; i < len; ++i) {
-    const int32_t diff = src1[i] - src2[i];
-    sse2 += diff * diff;
-  }
-  return sse2;
-}
-
-static uint32_t HorizontalAdd16b(const __m128i* const m) {
-  uint16_t tmp[8];
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi16(*m, a);
-  _mm_storeu_si128((__m128i*)tmp, b);
-  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
-}
-
-static uint32_t HorizontalAdd32b(const __m128i* const m) {
-  const __m128i a = _mm_srli_si128(*m, 8);
-  const __m128i b = _mm_add_epi32(*m, a);
-  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
-  return (uint32_t)_mm_cvtsi128_si32(c);
-}
-
-static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
-
-#define ACCUMULATE_ROW(WEIGHT) do {                         \
-  /* compute row weight (Wx * Wy) */                        \
-  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
-  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
-  /* process 8 bytes at a time (7 bytes, actually) */       \
-  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
-  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
-  /* convert to 16b and multiply by weight */               \
-  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
-  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
-  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
-  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
-  /* accumulate */                                          \
-  xm  = _mm_add_epi16(xm, wa1);                             \
-  ym  = _mm_add_epi16(ym, wb1);                             \
-  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
-  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
-  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
-  src1 += stride1;                                          \
-  src2 += stride2;                                          \
-} while (0)
-
-static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
-                           const uint8_t* src2, int stride2) {
-  VP8DistoStats stats;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i xm = zero, ym = zero;                // 16b accums
-  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
-  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
-  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
-  ACCUMULATE_ROW(1);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(4);
-  ACCUMULATE_ROW(3);
-  ACCUMULATE_ROW(2);
-  ACCUMULATE_ROW(1);
-  stats.xm  = HorizontalAdd16b(&xm);
-  stats.ym  = HorizontalAdd16b(&ym);
-  stats.xxm = HorizontalAdd32b(&xxm);
-  stats.xym = HorizontalAdd32b(&xym);
-  stats.yym = HorizontalAdd32b(&yym);
-  return VP8SSIMFromStats(&stats);
-}
-
-extern void VP8SSIMDspInitSSE2(void);
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
-  VP8AccumulateSSE = AccumulateSSE_SSE2;
-  VP8SSIMGet = SSIMGet_SSE2;
+  VP8CollectHistogram = CollectHistogram_SSE2;
+  VP8EncPredLuma16 = Intra16Preds_SSE2;
+  VP8EncPredChroma8 = IntraChromaPreds_SSE2;
+  VP8EncPredLuma4 = Intra4Preds_SSE2;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE2;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE2;
+  VP8ITransform = ITransform_SSE2;
+  VP8FTransform = FTransform_SSE2;
+  VP8FTransform2 = FTransform2_SSE2;
+  VP8FTransformWHT = FTransformWHT_SSE2;
+  VP8SSE16x16 = SSE16x16_SSE2;
+  VP8SSE16x8 = SSE16x8_SSE2;
+  VP8SSE8x8 = SSE8x8_SSE2;
+  VP8SSE4x4 = SSE4x4_SSE2;
+  VP8TDisto4x4 = Disto4x4_SSE2;
+  VP8TDisto16x16 = Disto16x16_SSE2;
+  VP8Mean16x4 = Mean16x4_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
 
 WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
-WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
 
 #endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/enc_sse41.c b/thirdparty/libwebp/src/dsp/enc_sse41.c
index e32086d9fd..924035a644 100644
--- a/thirdparty/libwebp/dsp/enc_sse41.c
+++ b/thirdparty/libwebp/src/dsp/enc_sse41.c
@@ -11,21 +11,21 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 #include <smmintrin.h>
 #include <stdlib.h>  // for abs()
 
-#include "./common_sse2.h"
-#include "../enc/vp8i_enc.h"
+#include "src/dsp/common_sse2.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Compute susceptibility based on DCT-coeff histograms.
 
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
-                             int start_block, int end_block,
-                             VP8Histogram* const histo) {
+static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
+                                   int start_block, int end_block,
+                                   VP8Histogram* const histo) {
   const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
   int j;
   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
@@ -70,8 +70,8 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 // Hadamard transform
 // Returns the weighted sum of the absolute value of transformed coefficients.
 // w[] contains a row-major 4 by 4 symmetric matrix.
-static int TTransform(const uint8_t* inA, const uint8_t* inB,
-                      const uint16_t* const w) {
+static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
+                            const uint16_t* const w) {
   int32_t sum[4];
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
 
@@ -168,19 +168,19 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
   return sum[0] + sum[1] + sum[2] + sum[3];
 }
 
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
-                    const uint16_t* const w) {
-  const int diff_sum = TTransform(a, b, w);
+static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
+                          const uint16_t* const w) {
+  const int diff_sum = TTransform_SSE41(a, b, w);
   return abs(diff_sum) >> 5;
 }
 
-static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
-                      const uint16_t* const w) {
+static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
+                            const uint16_t* const w) {
   int D = 0;
   int x, y;
   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
     for (x = 0; x < 16; x += 4) {
-      D += Disto4x4(a + x + y, b + x + y, w);
+      D += Disto4x4_SSE41(a + x + y, b + x + y, w);
     }
   }
   return D;
@@ -197,9 +197,9 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
                2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
 
-static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
-                                       const uint16_t* const sharpen,
-                                       const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                                             const uint16_t* const sharpen,
+                                             const VP8Matrix* const mtx) {
   const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
   const __m128i zero = _mm_setzero_si128();
   __m128i out0, out8;
@@ -300,22 +300,22 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
 
 #undef PSHUFB_CST
 
-static int QuantizeBlock(int16_t in[16], int16_t out[16],
-                         const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
+                               const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
 }
 
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
-                            const VP8Matrix* const mtx) {
-  return DoQuantizeBlock(in, out, NULL, mtx);
+static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
+                                  const VP8Matrix* const mtx) {
+  return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
 }
 
-static int Quantize2Blocks(int16_t in[32], int16_t out[32],
-                           const VP8Matrix* const mtx) {
+static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
+                                 const VP8Matrix* const mtx) {
   int nz;
   const uint16_t* const sharpen = &mtx->sharpen_[0];
-  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
-  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+  nz  = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+  nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
   return nz;
 }
 
@@ -324,12 +324,12 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
 
 extern void VP8EncDspInitSSE41(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
-  VP8CollectHistogram = CollectHistogram;
-  VP8EncQuantizeBlock = QuantizeBlock;
-  VP8EncQuantize2Blocks = Quantize2Blocks;
-  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
-  VP8TDisto4x4 = Disto4x4;
-  VP8TDisto16x16 = Disto16x16;
+  VP8CollectHistogram = CollectHistogram_SSE41;
+  VP8EncQuantizeBlock = QuantizeBlock_SSE41;
+  VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
+  VP8TDisto4x4 = Disto4x4_SSE41;
+  VP8TDisto16x16 = Disto16x16_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/filters.c b/thirdparty/libwebp/src/dsp/filters.c
index 65f34aad1f..ca5f877da7 100644
--- a/thirdparty/libwebp/dsp/filters.c
+++ b/thirdparty/libwebp/src/dsp/filters.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -20,16 +20,17 @@
 // Helpful macro.
 
 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
   assert(width > 0);                                                           \
   assert(height > 0);                                                          \
   assert(stride >= width);                                                     \
   assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
   (void)height;  // Silence unused warning.
 
-static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
-                                    uint8_t* dst, int length, int inverse) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
+                                      uint8_t* dst, int length, int inverse) {
   int i;
   if (inverse) {
     for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
@@ -41,10 +42,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
 //------------------------------------------------------------------------------
 // Horizontal filter.
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           int inverse, uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
+                                             int width, int height, int stride,
+                                             int row, int num_rows,
+                                             int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -56,7 +57,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     preds += stride;
     in += stride;
@@ -66,8 +67,8 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   // Filter line-by-line.
   while (row < last_row) {
     // Leftmost pixel is predicted from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     ++row;
     preds += stride;
     in += stride;
@@ -78,10 +79,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     in += stride;
     out += stride;
@@ -105,26 +106,28 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 
   // Filter line-by-line.
   while (row < last_row) {
-    PredictLine(in, preds, out, width, inverse);
+    PredictLine_C(in, preds, out, width, inverse);
     ++row;
     preds += stride;
     in += stride;
     out += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         int inverse, uint8_t* out) {
+#if !WEBP_NEON_OMIT_C_CODE
+static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -136,7 +139,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
     row = 1;
     preds += stride;
     in += stride;
@@ -147,11 +150,11 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   while (row < last_row) {
     int w;
     // leftmost pixel: predict from above.
-    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine_C(in, preds - stride, out, 1, inverse);
     for (w = 1; w < width; ++w) {
-      const int pred = GradientPredictor(preds[w - 1],
-                                         preds[w - stride],
-                                         preds[w - stride - 1]);
+      const int pred = GradientPredictor_C(preds[w - 1],
+                                           preds[w - stride],
+                                           preds[w - stride - 1]);
       out[w] = in[w] + (inverse ? pred : -pred);
     }
     ++row;
@@ -160,32 +163,34 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
     out += stride;
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef SANITY_CHECK
 
 //------------------------------------------------------------------------------
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+#if !WEBP_NEON_OMIT_C_CODE
+static void HorizontalFilter_C(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
+                       filtered_data);
 }
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void VerticalFilter_C(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
 
-
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+static void GradientFilter_C(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
 }
-
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 //------------------------------------------------------------------------------
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                                 uint8_t* out, int width) {
   uint8_t pred = (prev == NULL) ? 0 : prev[0];
   int i;
   for (i = 0; i < width; ++i) {
@@ -194,26 +199,28 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
   }
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+#if !WEBP_NEON_OMIT_C_CODE
+static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
   } else {
     int i;
     for (i = 0; i < width; ++i) out[i] = prev[i] + in[i];
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
+                               uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_C(NULL, in, out, width);
   } else {
     uint8_t top = prev[0], top_left = top, left = top;
     int i;
     for (i = 0; i < width; ++i) {
       top = prev[i];  // need to read this first, in case prev==out
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_C(left, top, top_left);
       top_left = top;
       out[i] = left;
     }
@@ -238,14 +245,18 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
   if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
 
   WebPUnfilters[WEBP_FILTER_NONE] = NULL;
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
+#endif
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
 
   WebPFilters[WEBP_FILTER_NONE] = NULL;
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
+#endif
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -253,11 +264,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
       VP8FiltersInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8FiltersInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       VP8FiltersInitMIPSdspR2();
@@ -269,5 +275,20 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8FiltersInitNEON();
+  }
+#endif
+
+  assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
+  assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
+  assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
+
   filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/filters_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
index 1d82e3c2e1..9382b12823 100644
--- a/thirdparty/libwebp/dsp/filters_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "../dsp/dsp.h"
+#include "src/dsp/dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
@@ -101,8 +101,8 @@
     );                                                                         \
   } while (0)
 
-static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
-                                    int length) {
+static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
+                                              int length) {
   DO_PREDICT_LINE(src, dst, length, 0);
 }
 
@@ -192,10 +192,11 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
+                                                     int width, int height,
+                                                     int stride,
+                                                     int row, int num_rows,
+                                                     uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -207,7 +208,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     preds += stride;
     in += stride;
@@ -219,9 +220,11 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
+                                       int width, int height,
+                                       int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                               filtered_data);
 }
 
 //------------------------------------------------------------------------------
@@ -237,9 +240,11 @@ static void HorizontalFilter(const uint8_t* data, int width, int height,
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
+                                                   int width, int height,
+                                                   int stride,
+                                                   int row, int num_rows,
+                                                   uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -252,7 +257,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -266,15 +271,16 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }
 
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
   int temp0;
   __asm__ volatile (
     "addu             %[temp0],   %[a],       %[b]        \n\t"
@@ -293,9 +299,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
       int w;                                                                   \
       PREDICT_LINE_ONE_PASS(in, PREDS - stride, out);                          \
       for (w = 1; w < width; ++w) {                                            \
-        const int pred = GradientPredictor(PREDS[w - 1],                       \
-                                           PREDS[w - stride],                  \
-                                           PREDS[w - stride - 1]);             \
+        const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1],             \
+                                                     PREDS[w - stride],        \
+                                                     PREDS[w - stride - 1]);   \
         out[w] = in[w] OPERATION pred;                                         \
       }                                                                        \
       ++row;                                                                   \
@@ -304,9 +310,9 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
     }                                                                          \
   } while (0)
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
+                                       int width, int height, int stride,
+                                       int row, int num_rows, uint8_t* out) {
   const uint8_t* preds;
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
@@ -318,7 +324,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLine(in + 1, out + 1, width - 1);
+    PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
     row = 1;
     preds += stride;
     in += stride;
@@ -330,38 +336,39 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
 }
 #undef FILTER_LINE_BY_LINE
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
+                                     int stride, uint8_t* filtered_data) {
+  DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
+                             filtered_data);
 }
 
 //------------------------------------------------------------------------------
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                         uint8_t* out, int width) {
  out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
  DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
   } else {
     DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
   }
 }
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
+                                       uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
   } else {
     uint8_t top = prev[0], top_left = top, left = top;
     int i;
     for (i = 0; i < width; ++i) {
       top = prev[i];  // need to read this first, in case prev==dst
-      left = in[i] + GradientPredictor(left, top, top_left);
+      left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
       top_left = top;
       out[i] = left;
     }
@@ -379,13 +386,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
 
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/filters_msa.c b/thirdparty/libwebp/src/dsp/filters_msa.c
index 4b8922d0bc..14c437d141 100644
--- a/thirdparty/libwebp/dsp/filters_msa.c
+++ b/thirdparty/libwebp/src/dsp/filters_msa.c
@@ -11,11 +11,11 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
+#include "src/dsp/msa_macro.h"
 
 #include <assert.h>
 
@@ -66,8 +66,8 @@ static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
 //------------------------------------------------------------------------------
 // Horrizontal filter
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
+static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
+                                 int stride, uint8_t* filtered_data) {
   const uint8_t* preds = data;
   const uint8_t* in = data;
   uint8_t* out = filtered_data;
@@ -129,8 +129,8 @@ static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
 }
 
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void GradientFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
@@ -157,8 +157,8 @@ static void GradientFilter(const uint8_t* data, int width, int height,
 //------------------------------------------------------------------------------
 // Vertical filter
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
+static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
+                               int stride, uint8_t* filtered_data) {
   const uint8_t* in = data;
   const uint8_t* preds = data;
   uint8_t* out = filtered_data;
@@ -190,9 +190,9 @@ static void VerticalFilter(const uint8_t* data, int width, int height,
 extern void VP8FiltersInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/filters_neon.c b/thirdparty/libwebp/src/dsp/filters_neon.c
index 4d6e50cc76..3e6a578ea7 100644
--- a/thirdparty/libwebp/dsp/filters_neon.c
+++ b/thirdparty/libwebp/src/dsp/filters_neon.c
@@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
-#include "./neon.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Helpful macros.
@@ -134,7 +134,7 @@ static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
 }
 
 static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
   DoVerticalFilter_NEON(data, width, height, stride, 0, height,
                         filtered_data);
 }
@@ -196,7 +196,7 @@ static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
 }
 
 static void GradientFilter_NEON(const uint8_t* data, int width, int height,
-                               int stride, uint8_t* filtered_data) {
+                                int stride, uint8_t* filtered_data) {
   DoGradientFilter_NEON(data, width, height, stride, 0, height,
                         filtered_data);
 }
@@ -251,9 +251,11 @@ static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
 // GradientUnfilter_NEON is correct but slower than the C-version,
 // at least on ARM64. For armv7, it's a wash.
 // So best is to disable it for now, but keep the idea around...
-// #define USE_GRADIENT_UNFILTER
+#if !defined(USE_GRADIENT_UNFILTER)
+#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
+#endif
 
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
 #define GRAD_PROCESS_LANE(L)  do {                                             \
   const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
   const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
@@ -292,7 +294,7 @@ static void GradientPredictInverse_NEON(const uint8_t* const in,
 #undef GRAD_PROCESS_LANE
 
 static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
-                                 uint8_t* out, int width) {
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
     HorizontalUnfilter_NEON(NULL, in, out, width);
   } else {
@@ -311,7 +313,7 @@ extern void VP8FiltersInitNEON(void);
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
-#if defined(USE_GRADIENT_UNFILTER)
+#if (USE_GRADIENT_UNFILTER == 1)
   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
 #endif
 
diff --git a/thirdparty/libwebp/dsp/filters_sse2.c b/thirdparty/libwebp/src/dsp/filters_sse2.c
index 67f77999e6..5a18895676 100644
--- a/thirdparty/libwebp/dsp/filters_sse2.c
+++ b/thirdparty/libwebp/src/dsp/filters_sse2.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
@@ -24,16 +24,16 @@
 // Helpful macro.
 
 # define SANITY_CHECK(in, out)                                                 \
-  assert(in != NULL);                                                          \
-  assert(out != NULL);                                                         \
+  assert((in) != NULL);                                                        \
+  assert((out) != NULL);                                                       \
   assert(width > 0);                                                           \
   assert(height > 0);                                                          \
   assert(stride >= width);                                                     \
   assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
   (void)height;  // Silence unused warning.
 
-static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
-                           uint8_t* dst, int length) {
+static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
+                                uint8_t* dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -51,7 +51,7 @@ static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
 }
 
 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
+static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
   int i;
   const int max_pos = length & ~31;
   assert(length >= 0);
@@ -71,10 +71,11 @@ static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) {
 //------------------------------------------------------------------------------
 // Horizontal filter.
 
-static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
-                                           int width, int height, int stride,
-                                           int row, int num_rows,
-                                           uint8_t* out) {
+static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
+                                                int width, int height,
+                                                int stride,
+                                                int row, int num_rows,
+                                                uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -84,7 +85,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   if (row == 0) {
     // Leftmost pixel is the same as input for topmost scanline.
     out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -94,7 +95,7 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
   while (row < last_row) {
     // Leftmost pixel is predicted from above.
     out[0] = in[0] - in[-stride];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     ++row;
     in += stride;
     out += stride;
@@ -104,9 +105,10 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Vertical filter.
 
-static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows, uint8_t* out) {
+static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -117,7 +119,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
     // Very first top-left pixel is copied.
     out[0] = in[0];
     // Rest of top scan-line is left-predicted.
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -125,7 +127,7 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 
   // Filter line-by-line.
   while (row < last_row) {
-    PredictLineTop(in, in - stride, out, width);
+    PredictLineTop_SSE2(in, in - stride, out, width);
     ++row;
     in += stride;
     out += stride;
@@ -135,14 +137,14 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
 //------------------------------------------------------------------------------
 // Gradient filter.
 
-static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
   const int g = a + b - c;
   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static void GradientPredictDirect(const uint8_t* const row,
-                                  const uint8_t* const top,
-                                  uint8_t* const out, int length) {
+static void GradientPredictDirect_SSE2(const uint8_t* const row,
+                                       const uint8_t* const top,
+                                       uint8_t* const out, int length) {
   const int max_pos = length & ~7;
   int i;
   const __m128i zero = _mm_setzero_si128();
@@ -161,14 +163,14 @@ static void GradientPredictDirect(const uint8_t* const row,
     _mm_storel_epi64((__m128i*)(out + i), H);
   }
   for (; i < length; ++i) {
-    out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+    out[i] = row[i] - GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
   }
 }
 
-static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
-                                         int width, int height, int stride,
-                                         int row, int num_rows,
-                                         uint8_t* out) {
+static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
+                                              int width, int height, int stride,
+                                              int row, int num_rows,
+                                              uint8_t* out) {
   const size_t start_offset = row * stride;
   const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
@@ -178,7 +180,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // left prediction for top scan-line
   if (row == 0) {
     out[0] = in[0];
-    PredictLineLeft(in + 1, out + 1, width - 1);
+    PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
     row = 1;
     in += stride;
     out += stride;
@@ -187,7 +189,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
   // Filter line-by-line.
   while (row < last_row) {
     out[0] = in[0] - in[-stride];
-    GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+    GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
     ++row;
     in += stride;
     out += stride;
@@ -198,26 +200,27 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
 
 //------------------------------------------------------------------------------
 
-static void HorizontalFilter(const uint8_t* data, int width, int height,
-                             int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data);
+static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
+                                  int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
+                          filtered_data);
 }
 
-static void VerticalFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, height, filtered_data);
+static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }
 
-static void GradientFilter(const uint8_t* data, int width, int height,
-                           int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, height, filtered_data);
+static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
+                                int stride, uint8_t* filtered_data) {
+  DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
 }
 
 //------------------------------------------------------------------------------
 // Inverse transforms
 
-static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
-                               uint8_t* out, int width) {
+static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                    uint8_t* out, int width) {
   int i;
   __m128i last;
   out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
@@ -238,10 +241,10 @@ static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in,
   for (; i < width; ++i) out[i] = in[i] + out[i - 1];
 }
 
-static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
   } else {
     int i;
     const int max_pos = width & ~31;
@@ -260,9 +263,9 @@ static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in,
   }
 }
 
-static void GradientPredictInverse(const uint8_t* const in,
-                                   const uint8_t* const top,
-                                   uint8_t* const row, int length) {
+static void GradientPredictInverse_SSE2(const uint8_t* const in,
+                                        const uint8_t* const top,
+                                        uint8_t* const row, int length) {
   if (length > 0) {
     int i;
     const int max_pos = length & ~7;
@@ -293,18 +296,18 @@ static void GradientPredictInverse(const uint8_t* const in,
       _mm_storel_epi64((__m128i*)&row[i], out);
     }
     for (; i < length; ++i) {
-      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+      row[i] = in[i] + GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
     }
   }
 }
 
-static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
-                             uint8_t* out, int width) {
+static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
+                                  uint8_t* out, int width) {
   if (prev == NULL) {
-    HorizontalUnfilter(NULL, in, out, width);
+    HorizontalUnfilter_SSE2(NULL, in, out, width);
   } else {
     out[0] = in[0] + prev[0];  // predict from above
-    GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1);
+    GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
   }
 }
 
@@ -314,13 +317,13 @@ static void GradientUnfilter(const uint8_t* prev, const uint8_t* in,
 extern void VP8FiltersInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
-  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
-  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
-  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
 
-  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
-  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
-  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/lossless.c b/thirdparty/libwebp/src/dsp/lossless.c
index 20d18f6ecd..83f553d9ad 100644
--- a/thirdparty/libwebp/dsp/lossless.c
+++ b/thirdparty/libwebp/src/dsp/lossless.c
@@ -13,14 +13,15 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #define MAX_DIFF_COST (1e30f)
 
@@ -80,8 +81,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
 }
 
-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
+// inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION <= 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
@@ -107,69 +109,69 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors
 
-static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
   (void)top;
   (void)left;
   return ARGB_BLACK;
 }
-static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
   (void)top;
   return left;
 }
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[0];
 }
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[1];
 }
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
   (void)left;
   return top[-1];
 }
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average3(left, top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[-1]);
   return pred;
 }
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(left, top[0]);
   return pred;
 }
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[-1], top[0]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average2(top[0], top[1]);
   (void)left;
   return pred;
 }
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
   return pred;
 }
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = Select(top[0], left, top[-1]);
   return pred;
 }
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
   const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
-GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
-static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
-                          int num_pixels, uint32_t* out) {
+GENERATE_PREDICTOR_ADD(Predictor0_C, PredictorAdd0_C)
+static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
+                            int num_pixels, uint32_t* out) {
   int i;
   uint32_t left = out[-1];
   for (i = 0; i < num_pixels; ++i) {
@@ -177,29 +179,29 @@ static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
   }
   (void)upper;
 }
-GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
-GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
-GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
-GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
-GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
-GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
-GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
-GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
-GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
-GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
-GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
-GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
 
 //------------------------------------------------------------------------------
 
 // Inverse prediction.
-static void PredictorInverseTransform(const VP8LTransform* const transform,
-                                      int y_start, int y_end,
-                                      const uint32_t* in, uint32_t* out) {
+static void PredictorInverseTransform_C(const VP8LTransform* const transform,
+                                        int y_start, int y_end,
+                                        const uint32_t* in, uint32_t* out) {
   const int width = transform->xsize_;
   if (y_start == 0) {  // First Row follows the L (mode=1) mode.
-    PredictorAdd0(in, NULL, 1, out);
-    PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+    PredictorAdd0_C(in, NULL, 1, out);
+    PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
     in += width;
     out += width;
     ++y_start;
@@ -217,7 +219,7 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
       const uint32_t* pred_mode_src = pred_mode_base;
       int x = 1;
       // First pixel follows the T (mode=2) mode.
-      PredictorAdd2(in, out - width, 1, out);
+      PredictorAdd2_C(in, out - width, 1, out);
       // .. the rest:
       while (x < width) {
         const VP8LPredictorAddSubFunc pred_func =
@@ -272,8 +274,8 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
     const uint32_t argb = src[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
     new_red += ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue += ColorTransformDelta(m->green_to_blue_, green);
@@ -284,9 +286,9 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
 }
 
 // Color space inverse transform.
-static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
-                                       int y_start, int y_end,
-                                       const uint32_t* src, uint32_t* dst) {
+static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
+                                         int y_start, int y_end,
+                                         const uint32_t* src, uint32_t* dst) {
   const int width = transform->xsize_;
   const int tile_width = 1 << transform->bits_;
   const int mask = tile_width - 1;
@@ -362,10 +364,10 @@ STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform,               \
   }                                                                            \
 }
 
-COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
-                    VP8GetARGBIndex, VP8GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
-                    8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
+                    uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
+                    uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
 
 #undef COLOR_INDEX_INVERSE
 
@@ -380,7 +382,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
       break;
     case PREDICTOR_TRANSFORM:
-      PredictorInverseTransform(transform, row_start, row_end, in, out);
+      PredictorInverseTransform_C(transform, row_start, row_end, in, out);
       if (row_end != transform->ysize_) {
         // The last predicted row in this iteration will be the top-pred row
         // for the first row in next iteration.
@@ -389,7 +391,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
       }
       break;
     case CROSS_COLOR_TRANSFORM:
-      ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
+      ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
       break;
     case COLOR_INDEXING_TRANSFORM:
       if (in == out && transform->bits_ > 0) {
@@ -403,9 +405,9 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
             VP8LSubSampleSize(transform->xsize_, transform->bits_);
         uint32_t* const src = out + out_stride - in_stride;
         memmove(src, out, in_stride * sizeof(*src));
-        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
       } else {
-        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+        ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
       }
       break;
   }
@@ -452,7 +454,7 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
     const uint32_t argb = *src++;
     const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
     const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     *dst++ = ba;
     *dst++ = rg;
 #else
@@ -469,7 +471,7 @@ void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
     const uint32_t argb = *src++;
     const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
     const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     *dst++ = gb;
     *dst++ = rg;
 #else
@@ -496,22 +498,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
     const uint32_t* const src_end = src + num_pixels;
     while (src < src_end) {
       const uint32_t argb = *src++;
-
-#if !defined(WORDS_BIGENDIAN)
-#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
       WebPUint32ToMem(dst, BSwap32(argb));
-#else  // WEBP_REFERENCE_IMPLEMENTATION
-      dst[0] = (argb >> 24) & 0xff;
-      dst[1] = (argb >> 16) & 0xff;
-      dst[2] = (argb >>  8) & 0xff;
-      dst[3] = (argb >>  0) & 0xff;
-#endif
-#else  // WORDS_BIGENDIAN
-      dst[0] = (argb >>  0) & 0xff;
-      dst[1] = (argb >>  8) & 0xff;
-      dst[2] = (argb >> 16) & 0xff;
-      dst[3] = (argb >> 24) & 0xff;
-#endif
       dst += sizeof(argb);
     }
   } else {
@@ -593,23 +580,23 @@ extern void VP8LDspInitMSA(void);
 static volatile VP8CPUInfo lossless_last_cpuinfo_used =
     (VP8CPUInfo)&lossless_last_cpuinfo_used;
 
-#define COPY_PREDICTOR_ARRAY(IN, OUT) do {              \
-  (OUT)[0] = IN##0;                                     \
-  (OUT)[1] = IN##1;                                     \
-  (OUT)[2] = IN##2;                                     \
-  (OUT)[3] = IN##3;                                     \
-  (OUT)[4] = IN##4;                                     \
-  (OUT)[5] = IN##5;                                     \
-  (OUT)[6] = IN##6;                                     \
-  (OUT)[7] = IN##7;                                     \
-  (OUT)[8] = IN##8;                                     \
-  (OUT)[9] = IN##9;                                     \
-  (OUT)[10] = IN##10;                                   \
-  (OUT)[11] = IN##11;                                   \
-  (OUT)[12] = IN##12;                                   \
-  (OUT)[13] = IN##13;                                   \
-  (OUT)[14] = IN##0; /* <- padding security sentinels*/ \
-  (OUT)[15] = IN##0;                                    \
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
+  (OUT)[0] = IN##0_C;                                     \
+  (OUT)[1] = IN##1_C;                                     \
+  (OUT)[2] = IN##2_C;                                     \
+  (OUT)[3] = IN##3_C;                                     \
+  (OUT)[4] = IN##4_C;                                     \
+  (OUT)[5] = IN##5_C;                                     \
+  (OUT)[6] = IN##6_C;                                     \
+  (OUT)[7] = IN##7_C;                                     \
+  (OUT)[8] = IN##8_C;                                     \
+  (OUT)[9] = IN##9_C;                                     \
+  (OUT)[10] = IN##10_C;                                   \
+  (OUT)[11] = IN##11_C;                                   \
+  (OUT)[12] = IN##12_C;                                   \
+  (OUT)[13] = IN##13_C;                                   \
+  (OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
+  (OUT)[15] = IN##0_C;                                    \
 } while (0);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
@@ -620,18 +607,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
 
+#if !WEBP_NEON_OMIT_C_CODE
   VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
 
   VP8LTransformColorInverse = VP8LTransformColorInverse_C;
 
-  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
   VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+#endif
+
   VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
   VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
-  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
 
-  VP8LMapColor32b = MapARGB;
-  VP8LMapColor8b = MapAlpha;
+  VP8LMapColor32b = MapARGB_C;
+  VP8LMapColor8b = MapAlpha_C;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -640,11 +630,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
       VP8LDspInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       VP8LDspInitMIPSdspR2();
@@ -656,6 +641,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LDspInitNEON();
+  }
+#endif
+
+  assert(VP8LAddGreenToBlueAndRed != NULL);
+  assert(VP8LTransformColorInverse != NULL);
+  assert(VP8LConvertBGRAToRGBA != NULL);
+  assert(VP8LConvertBGRAToRGB != NULL);
+  assert(VP8LConvertBGRAToBGR != NULL);
+  assert(VP8LConvertBGRAToRGBA4444 != NULL);
+  assert(VP8LConvertBGRAToRGB565 != NULL);
+  assert(VP8LMapColor32b != NULL);
+  assert(VP8LMapColor8b != NULL);
+
   lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #undef COPY_PREDICTOR_ARRAY
diff --git a/thirdparty/libwebp/dsp/lossless.h b/thirdparty/libwebp/src/dsp/lossless.h
index 352a54e509..a99dbda686 100644
--- a/thirdparty/libwebp/dsp/lossless.h
+++ b/thirdparty/libwebp/src/dsp/lossless.h
@@ -15,18 +15,18 @@
 #ifndef WEBP_DSP_LOSSLESS_H_
 #define WEBP_DSP_LOSSLESS_H_
 
-#include "../webp/types.h"
-#include "../webp/decode.h"
+#include "src/webp/types.h"
+#include "src/webp/decode.h"
 
-#include "../enc/histogram_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
 //------------------------------------------------------------------------------
@@ -124,7 +124,7 @@ void VP8LDspInit(void);
 typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
 extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
-                                       uint32_t* const dst, int num_pixels);
+                                       uint32_t* dst, int num_pixels);
 extern VP8LTransformColorFunc VP8LTransformColor;
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
     const uint32_t* argb, int stride,
diff --git a/thirdparty/libwebp/dsp/lossless_common.h b/thirdparty/libwebp/src/dsp/lossless_common.h
index c40f711208..a2648d1737 100644
--- a/thirdparty/libwebp/dsp/lossless_common.h
+++ b/thirdparty/libwebp/src/dsp/lossless_common.h
@@ -16,9 +16,9 @@
 #ifndef WEBP_DSP_LOSSLESS_COMMON_H_
 #define WEBP_DSP_LOSSLESS_COMMON_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
-#include "../utils/utils.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,14 +93,6 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1))) {  // zero or a power of two.
-    return log_floor;
-  }
-  return log_floor + 1;
-}
-
 // Splitting of distance and length codes into prefixes and
 // extra bits. The prefixes are encoded with an entropy code
 // while the extra bits are stored just as normal bits.
diff --git a/thirdparty/libwebp/dsp/lossless_enc.c b/thirdparty/libwebp/src/dsp/lossless_enc.c
index 4e46fbab8b..92ca3c0542 100644
--- a/thirdparty/libwebp/dsp/lossless_enc.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc.c
@@ -13,15 +13,16 @@
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include "../dec/vp8li_dec.h"
-#include "../utils/endian_inl_utils.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
-#include "./yuv.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/yuv.h"
 
 // lookup table for small values of log2(int)
 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
@@ -325,7 +326,7 @@ const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
   112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
 };
 
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
@@ -351,7 +352,7 @@ static float FastSLog2Slow(uint32_t v) {
   }
 }
 
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_C(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     int log_cnt = 0;
@@ -380,7 +381,7 @@ static float FastLog2Slow(uint32_t v) {
 // Methods to calculate Entropy (Shannon).
 
 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
   int i;
   double retval = 0.;
   int sumX = 0, sumXY = 0;
@@ -453,9 +454,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
-static void GetEntropyUnrefined(const uint32_t X[], int length,
-                                VP8LBitEntropy* const bit_entropy,
-                                VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_C(const uint32_t X[], int length,
+                                  VP8LBitEntropy* const bit_entropy,
+                                  VP8LStreaks* const stats) {
   int i;
   int i_prev = 0;
   uint32_t x_prev = X[0];
@@ -474,10 +475,11 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
-                                        int length,
-                                        VP8LBitEntropy* const bit_entropy,
-                                        VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
+                                          const uint32_t Y[],
+                                          int length,
+                                          VP8LBitEntropy* const bit_entropy,
+                                          VP8LStreaks* const stats) {
   int i = 1;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
@@ -520,8 +522,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
     const uint32_t argb = data[i];
     const uint32_t green = argb >> 8;
     const uint32_t red = argb >> 16;
-    int new_red = red;
-    int new_blue = argb;
+    int new_red = red & 0xff;
+    int new_blue = argb & 0xff;
     new_red -= ColorTransformDelta(m->green_to_red_, green);
     new_red &= 0xff;
     new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@@ -577,8 +579,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
 
 //------------------------------------------------------------------------------
 
-static int VectorMismatch(const uint32_t* const array1,
-                          const uint32_t* const array2, int length) {
+static int VectorMismatch_C(const uint32_t* const array1,
+                            const uint32_t* const array2, int length) {
   int match_len = 0;
 
   while (match_len < length && array1[match_len] == array2[match_len]) {
@@ -610,15 +612,15 @@ void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
 
 //------------------------------------------------------------------------------
 
-static double ExtraCost(const uint32_t* population, int length) {
+static double ExtraCost_C(const uint32_t* population, int length) {
   int i;
   double cost = 0.;
   for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
   return cost;
 }
 
-static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
-                                int length) {
+static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
+                                  int length) {
   int i;
   double cost = 0.;
   for (i = 2; i < length - 2; ++i) {
@@ -630,9 +632,9 @@ static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
 
 //------------------------------------------------------------------------------
 
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_C(const VP8LHistogram* const a,
+                           const VP8LHistogram* const b,
+                           VP8LHistogram* const out) {
   int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
@@ -869,26 +871,28 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 
   VP8LDspInit();
 
+#if !WEBP_NEON_OMIT_C_CODE
   VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
 
   VP8LTransformColor = VP8LTransformColor_C;
+#endif
 
   VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
   VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
 
-  VP8LFastLog2Slow = FastLog2Slow;
-  VP8LFastSLog2Slow = FastSLog2Slow;
+  VP8LFastLog2Slow = FastLog2Slow_C;
+  VP8LFastSLog2Slow = FastSLog2Slow_C;
 
-  VP8LExtraCost = ExtraCost;
-  VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+  VP8LExtraCost = ExtraCost_C;
+  VP8LExtraCostCombined = ExtraCostCombined_C;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
 
-  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
-  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
 
-  VP8LHistogramAdd = HistogramAdd;
+  VP8LHistogramAdd = HistogramAdd_C;
 
-  VP8LVectorMismatch = VectorMismatch;
+  VP8LVectorMismatch = VectorMismatch_C;
   VP8LBundleColorMap = VP8LBundleColorMap_C;
 
   VP8LPredictorsSub[0] = PredictorSub0_C;
@@ -937,11 +941,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
 #endif
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      VP8LEncDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       VP8LEncDspInitMIPS32();
@@ -958,6 +957,61 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    VP8LEncDspInitNEON();
+  }
+#endif
+
+  assert(VP8LSubtractGreenFromBlueAndRed != NULL);
+  assert(VP8LTransformColor != NULL);
+  assert(VP8LCollectColorBlueTransforms != NULL);
+  assert(VP8LCollectColorRedTransforms != NULL);
+  assert(VP8LFastLog2Slow != NULL);
+  assert(VP8LFastSLog2Slow != NULL);
+  assert(VP8LExtraCost != NULL);
+  assert(VP8LExtraCostCombined != NULL);
+  assert(VP8LCombinedShannonEntropy != NULL);
+  assert(VP8LGetEntropyUnrefined != NULL);
+  assert(VP8LGetCombinedEntropyUnrefined != NULL);
+  assert(VP8LHistogramAdd != NULL);
+  assert(VP8LVectorMismatch != NULL);
+  assert(VP8LBundleColorMap != NULL);
+  assert(VP8LPredictorsSub[0] != NULL);
+  assert(VP8LPredictorsSub[1] != NULL);
+  assert(VP8LPredictorsSub[2] != NULL);
+  assert(VP8LPredictorsSub[3] != NULL);
+  assert(VP8LPredictorsSub[4] != NULL);
+  assert(VP8LPredictorsSub[5] != NULL);
+  assert(VP8LPredictorsSub[6] != NULL);
+  assert(VP8LPredictorsSub[7] != NULL);
+  assert(VP8LPredictorsSub[8] != NULL);
+  assert(VP8LPredictorsSub[9] != NULL);
+  assert(VP8LPredictorsSub[10] != NULL);
+  assert(VP8LPredictorsSub[11] != NULL);
+  assert(VP8LPredictorsSub[12] != NULL);
+  assert(VP8LPredictorsSub[13] != NULL);
+  assert(VP8LPredictorsSub[14] != NULL);
+  assert(VP8LPredictorsSub[15] != NULL);
+  assert(VP8LPredictorsSub_C[0] != NULL);
+  assert(VP8LPredictorsSub_C[1] != NULL);
+  assert(VP8LPredictorsSub_C[2] != NULL);
+  assert(VP8LPredictorsSub_C[3] != NULL);
+  assert(VP8LPredictorsSub_C[4] != NULL);
+  assert(VP8LPredictorsSub_C[5] != NULL);
+  assert(VP8LPredictorsSub_C[6] != NULL);
+  assert(VP8LPredictorsSub_C[7] != NULL);
+  assert(VP8LPredictorsSub_C[8] != NULL);
+  assert(VP8LPredictorsSub_C[9] != NULL);
+  assert(VP8LPredictorsSub_C[10] != NULL);
+  assert(VP8LPredictorsSub_C[11] != NULL);
+  assert(VP8LPredictorsSub_C[12] != NULL);
+  assert(VP8LPredictorsSub_C[13] != NULL);
+  assert(VP8LPredictorsSub_C[14] != NULL);
+  assert(VP8LPredictorsSub_C[15] != NULL);
+
   lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
diff --git a/thirdparty/libwebp/dsp/lossless_enc_mips32.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 4186b9f50d..e7b58f4e8c 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_mips32.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -12,9 +12,9 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #if defined(WEBP_USE_MIPS32)
 
@@ -23,7 +23,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-static float FastSLog2Slow(uint32_t v) {
+static float FastSLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     uint32_t log_cnt, y, correction;
@@ -59,7 +59,7 @@ static float FastSLog2Slow(uint32_t v) {
   }
 }
 
-static float FastLog2Slow(uint32_t v) {
+static float FastLog2Slow_MIPS32(uint32_t v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
     uint32_t log_cnt, y;
@@ -104,7 +104,7 @@ static float FastLog2Slow(uint32_t v) {
 //     pop += 2;
 //   }
 //   return (double)cost;
-static double ExtraCost(const uint32_t* const population, int length) {
+static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
   int i, temp0, temp1;
   const uint32_t* pop = &population[4];
   const uint32_t* const LoopEnd = &population[length];
@@ -149,8 +149,8 @@ static double ExtraCost(const uint32_t* const population, int length) {
 //     pY += 2;
 //   }
 //   return (double)cost;
-static double ExtraCostCombined(const uint32_t* const X,
-                                const uint32_t* const Y, int length) {
+static double ExtraCostCombined_MIPS32(const uint32_t* const X,
+                                       const uint32_t* const Y, int length) {
   int i, temp0, temp1, temp2, temp3;
   const uint32_t* pX = &X[4];
   const uint32_t* pY = &Y[4];
@@ -241,9 +241,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
   *i_prev = i;
 }
 
-static void GetEntropyUnrefined(const uint32_t X[], int length,
-                                VP8LBitEntropy* const bit_entropy,
-                                VP8LStreaks* const stats) {
+static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
+                                       VP8LBitEntropy* const bit_entropy,
+                                       VP8LStreaks* const stats) {
   int i;
   int i_prev = 0;
   uint32_t x_prev = X[0];
@@ -262,26 +262,27 @@ static void GetEntropyUnrefined(const uint32_t X[], int length,
   bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
 }
 
-static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
-                                        int length,
-                                        VP8LBitEntropy* const bit_entropy,
-                                        VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
+                                               const uint32_t Y[],
+                                               int length,
+                                               VP8LBitEntropy* const entropy,
+                                               VP8LStreaks* const stats) {
   int i = 1;
   int i_prev = 0;
   uint32_t xy_prev = X[0] + Y[0];
 
   memset(stats, 0, sizeof(*stats));
-  VP8LBitEntropyInit(bit_entropy);
+  VP8LBitEntropyInit(entropy);
 
   for (i = 1; i < length; ++i) {
     const uint32_t xy = X[i] + Y[i];
     if (xy != xy_prev) {
-      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
     }
   }
-  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
 
-  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+  entropy->entropy += VP8LFastSLog2(entropy->sum);
 }
 
 #define ASM_START                                       \
@@ -374,9 +375,9 @@ static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
   }                                                     \
 } while (0)
 
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_MIPS32(const VP8LHistogram* const a,
+                                const VP8LHistogram* const b,
+                                VP8LHistogram* const out) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                              - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
@@ -415,13 +416,13 @@ static void HistogramAdd(const VP8LHistogram* const a,
 extern void VP8LEncDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
-  VP8LFastSLog2Slow = FastSLog2Slow;
-  VP8LFastLog2Slow = FastLog2Slow;
-  VP8LExtraCost = ExtraCost;
-  VP8LExtraCostCombined = ExtraCostCombined;
-  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
-  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
-  VP8LHistogramAdd = HistogramAdd;
+  VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
+  VP8LFastLog2Slow = FastLog2Slow_MIPS32;
+  VP8LExtraCost = ExtraCost_MIPS32;
+  VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
+  VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
+  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
+  VP8LHistogramAdd = HistogramAdd_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/lossless_enc_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
index 0abf3c4f36..5855e6ae15 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
@@ -12,14 +12,14 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
-                                        int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
+                                                  int num_pixels) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
   uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
@@ -78,8 +78,8 @@ static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
   return (uint32_t)((int)(color_pred) * color) >> 5;
 }
 
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
-                           int num_pixels) {
+static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
+                                     uint32_t* data, int num_pixels) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red, new_red1;
   const uint32_t G_to_R = m->green_to_red_;
@@ -171,10 +171,13 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
   return (new_blue & 0xff);
 }
 
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
-                                       int tile_width, int tile_height,
-                                       int green_to_blue, int red_to_blue,
-                                       int histo[]) {
+static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
+                                                 int stride,
+                                                 int tile_width,
+                                                 int tile_height,
+                                                 int green_to_blue,
+                                                 int red_to_blue,
+                                                 int histo[]) {
   const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
   const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
   const uint32_t mask = 0xff00ffu;
@@ -222,9 +225,12 @@ static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
   return (new_red & 0xff);
 }
 
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
-                                      int tile_width, int tile_height,
-                                      int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
+                                                int stride,
+                                                int tile_width,
+                                                int tile_height,
+                                                int green_to_red,
+                                                int histo[]) {
   const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
   while (tile_height-- > 0) {
     int x;
@@ -262,10 +268,10 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
 extern void VP8LEncDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
-  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
-  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
+  VP8LTransformColor = TransformColor_MIPSdspR2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/lossless_enc_msa.c b/thirdparty/libwebp/src/dsp/lossless_enc_msa.c
index 2f69ba3bca..600dddfb59 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_msa.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_msa.c
@@ -11,12 +11,12 @@
 //
 // Authors: Prashant Patil (Prashant.Patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
 
 #define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do {  \
   v8i16 g0, g1, t0, t1, t2, t3;                                               \
@@ -48,8 +48,8 @@
   dst = VSHF_UB(src, t0, mask1);                                \
 } while (0)
 
-static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
-                           int num_pixels) {
+static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
+                               int num_pixels) {
   v16u8 src0, dst0;
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                          (m->green_to_red_ << 16));
@@ -94,7 +94,8 @@ static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
   }
 }
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
+                                            int num_pixels) {
   int i;
   uint8_t* ptemp_data = (uint8_t*)argb_data;
   v16u8 src0, dst0, tmp0;
@@ -136,8 +137,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 extern void VP8LEncDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
+  VP8LTransformColor = TransformColor_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/lossless_enc_neon.c b/thirdparty/libwebp/src/dsp/lossless_enc_neon.c
index 4c56f2594b..7c7b73f8b6 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <arm_neon.h>
 
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
@@ -36,8 +36,8 @@ static const uint8_t kGreenShuffle[16] = {
   1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
 };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x16_t shuffle) {
   return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                      vtbl1q_u8(argb, vget_high_u8(shuffle)));
 }
@@ -45,14 +45,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 // 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x8_t shuffle) {
   return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                      vtbl1_u8(vget_high_u8(argb), shuffle));
 }
 #endif  // USE_VTBLQ
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
+                                             int num_pixels) {
   const uint32_t* const end = argb_data + (num_pixels & ~3);
 #ifdef USE_VTBLQ
   const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -61,7 +62,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 #endif
   for (; argb_data < end; argb_data += 4) {
     const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
-    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
     vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
   }
   // fallthrough and finish off with plain-C
@@ -71,8 +72,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColor(const VP8LMultipliers* const m,
-                           uint32_t* argb_data, int num_pixels) {
+static void TransformColor_NEON(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
   // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
@@ -102,7 +103,7 @@ static void TransformColor(const VP8LMultipliers* const m,
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
     // 0 g 0 g
-    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
     // x dr  x db1
     const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
     // r 0   b   0
@@ -132,8 +133,8 @@ static void TransformColor(const VP8LMultipliers* const m,
 extern void VP8LEncDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
+  VP8LTransformColor = TransformColor_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/lossless_enc_sse2.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 8ad85d94d7..1eaf35ca8e 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -11,22 +11,23 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 #include <assert.h>
 #include <emmintrin.h>
-#include "./lossless.h"
-#include "./common_sse2.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless_common.h"
 
 // For sign-extended multiplying constants, pre-shifted by 5:
-#define CST_5b(X)  (((int16_t)((uint16_t)X << 8)) >> 5)
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
+                                             int num_pixels) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
@@ -45,8 +46,8 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColor(const VP8LMultipliers* const m,
-                           uint32_t* argb_data, int num_pixels) {
+static void TransformColor_SSE2(const VP8LMultipliers* const m,
+                                uint32_t* argb_data, int num_pixels) {
   const __m128i mults_rb = _mm_set_epi16(
       CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
       CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
@@ -80,10 +81,10 @@ static void TransformColor(const VP8LMultipliers* const m,
 
 //------------------------------------------------------------------------------
 #define SPAN 8
-static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
-                                       int tile_width, int tile_height,
-                                       int green_to_blue, int red_to_blue,
-                                       int histo[]) {
+static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_blue, int red_to_blue,
+                                            int histo[]) {
   const __m128i mults_r = _mm_set_epi16(
       CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
       CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
@@ -131,9 +132,9 @@ static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
   }
 }
 
-static void CollectColorRedTransforms(const uint32_t* argb, int stride,
-                                      int tile_width, int tile_height,
-                                      int green_to_red, int histo[]) {
+static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
+                                           int tile_width, int tile_height,
+                                           int green_to_red, int histo[]) {
   const __m128i mults_g = _mm_set_epi16(
       0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
       0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
@@ -177,8 +178,8 @@ static void CollectColorRedTransforms(const uint32_t* argb, int stride,
 //------------------------------------------------------------------------------
 
 #define LINE_SIZE 16    // 8 or 16
-static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
-                      int size) {
+static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
+                           int size) {
   int i;
   assert(size % LINE_SIZE == 0);
   for (i = 0; i < size; i += LINE_SIZE) {
@@ -203,7 +204,7 @@ static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
   }
 }
 
-static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
   int i;
   assert(size % LINE_SIZE == 0);
   for (i = 0; i < size; i += LINE_SIZE) {
@@ -231,22 +232,22 @@ static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
 
 // Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
 // that's ok since the histogram values are less than 1<<28 (max picture size).
-static void HistogramAdd(const VP8LHistogram* const a,
-                         const VP8LHistogram* const b,
-                         VP8LHistogram* const out) {
+static void HistogramAdd_SSE2(const VP8LHistogram* const a,
+                              const VP8LHistogram* const b,
+                              VP8LHistogram* const out) {
   int i;
   const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   if (b != out) {
-    AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
-    AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVector_SSE2(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
   } else {
-    AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
-    AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
-    AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
-    AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->literal_, out->literal_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->red_, out->red_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->blue_, out->blue_, NUM_LITERAL_CODES);
+    AddVectorEq_SSE2(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
   }
   for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
     out->literal_[i] = a->literal_[i] + b->literal_[i];
@@ -261,9 +262,9 @@ static void HistogramAdd(const VP8LHistogram* const a,
 
 // Checks whether the X or Y contribution is worth computing and adding.
 // Used in loop unrolling.
-#define ANALYZE_X_OR_Y(x_or_y, j)                                   \
-  do {                                                              \
-    if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+#define ANALYZE_X_OR_Y(x_or_y, j)                                           \
+  do {                                                                      \
+    if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
   } while (0)
 
 // Checks whether the X + Y contribution is worth computing and adding.
@@ -276,7 +277,7 @@ static void HistogramAdd(const VP8LHistogram* const a,
     }                                  \
   } while (0)
 
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
   int i;
   double retval = 0.;
   int sumX, sumXY;
@@ -332,8 +333,8 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
 
 //------------------------------------------------------------------------------
 
-static int VectorMismatch(const uint32_t* const array1,
-                          const uint32_t* const array2, int length) {
+static int VectorMismatch_SSE2(const uint32_t* const array1,
+                               const uint32_t* const array2, int length) {
   int match_len;
 
   if (length >= 12) {
@@ -574,8 +575,8 @@ static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
 }
 
 // Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
-                            __m128i* const out) {
+static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
+                                 __m128i* const out) {
   // We can unpack with any value on the upper 32 bits, provided it's the same
   // on both operands (to that their sum of abs diff is zero). Here we use *A.
   const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
@@ -596,8 +597,8 @@ static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
     const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
     const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
     __m128i pa, pb;
-    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
-    GetSumAbsDiff32(&L, &TL, &pb);   // pb = sum |L-TL|
+    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
+    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
     {
       const __m128i mask = _mm_cmpgt_epi32(pb, pa);
       const __m128i A = _mm_and_si128(mask, L);
@@ -677,13 +678,13 @@ static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
 extern void VP8LEncDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
-  VP8LTransformColor = TransformColor;
-  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
-  VP8LCollectColorRedTransforms = CollectColorRedTransforms;
-  VP8LHistogramAdd = HistogramAdd;
-  VP8LCombinedShannonEntropy = CombinedShannonEntropy;
-  VP8LVectorMismatch = VectorMismatch;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
+  VP8LTransformColor = TransformColor_SSE2;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
+  VP8LHistogramAdd = HistogramAdd_SSE2;
+  VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+  VP8LVectorMismatch = VectorMismatch_SSE2;
   VP8LBundleColorMap = BundleColorMap_SSE2;
 
   VP8LPredictorsSub[0] = PredictorSub0_SSE2;
diff --git a/thirdparty/libwebp/dsp/lossless_enc_sse41.c b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 821057ccd4..3526a342d3 100644
--- a/thirdparty/libwebp/dsp/lossless_enc_sse41.c
+++ b/thirdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -11,17 +11,18 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE41)
 #include <assert.h>
 #include <smmintrin.h>
-#include "./lossless.h"
+#include "src/dsp/lossless.h"
 
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
+                                              int num_pixels) {
   int i;
   const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
                                            -1,  5, -1,  5, -1, 1, -1, 1);
@@ -43,7 +44,7 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
 extern void VP8LEncDspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
-  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/thirdparty/libwebp/dsp/lossless_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 2984ce8df7..9888854d57 100644
--- a/thirdparty/libwebp/dsp/lossless_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -12,12 +12,12 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 
 #define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)                 \
 static void FUNC_NAME(const TYPE* src,                                         \
@@ -86,8 +86,8 @@ static void FUNC_NAME(const TYPE* src,                                         \
   }                                                                            \
 }
 
-MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
-MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
 
 #undef MAP_COLOR_FUNCS
 
@@ -188,48 +188,52 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average3(left, top[0], top[1]);
 }
 
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average2(left, top[-1]);
 }
 
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   return Average2(left, top[0]);
 }
 
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   (void)left;
   return Average2(top[-1], top[0]);
 }
 
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
   (void)left;
   return Average2(top[0], top[1]);
 }
 
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return Average4(left, top[-1], top[0], top[1]);
 }
 
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return Select(top[0], left, top[-1]);
 }
 
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return ClampedAddSubtractFull(left, top[0], top[-1]);
 }
 
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_MIPSdspR2(uint32_t left,
+                                      const uint32_t* const top) {
   return ClampedAddSubtractHalf(left, top[0], top[-1]);
 }
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
+                                           uint32_t* dst) {
   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -285,9 +289,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
   );
 }
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
+                                            const uint32_t* src, int num_pixels,
+                                            uint32_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   uint32_t argb, argb1, new_red;
   const uint32_t G_to_R = m->green_to_red_;
@@ -356,8 +360,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
   if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -408,8 +412,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
+                                        int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -458,8 +462,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
+                                            int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -492,7 +496,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     "ins            %[temp3],    %[temp5],          16,   4    \n\t"
     "addiu          %[src],      %[src],            16         \n\t"
     "precr.qb.ph    %[temp3],    %[temp3],          %[temp2]   \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "usw            %[temp1],    0(%[dst])                     \n\t"
     "usw            %[temp3],    4(%[dst])                     \n\t"
 #else
@@ -514,7 +518,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     "ins            %[temp0],    %[temp5],          16,   4    \n\t"
     "addiu          %[src],      %[src],            4          \n\t"
     "precr.qb.ph    %[temp0],    %[temp0],          %[temp0]   \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "ush            %[temp0],    0(%[dst])                     \n\t"
 #else
     "wsbh           %[temp0],    %[temp0]                      \n\t"
@@ -532,8 +536,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToRGB565(const uint32_t* src,
-                                int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
+                                          int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3, temp4, temp5;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -570,7 +574,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     "ins            %[temp2],    %[temp3],          0,    5    \n\t"
     "addiu          %[src],      %[src],            16         \n\t"
     "append         %[temp2],    %[temp1],          16         \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "usw            %[temp0],    0(%[dst])                     \n\t"
     "usw            %[temp2],    4(%[dst])                     \n\t"
 #else
@@ -592,7 +596,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     "ins            %[temp4],    %[temp5],          0,    11   \n\t"
     "addiu          %[src],      %[src],            4          \n\t"
     "ins            %[temp4],    %[temp0],          0,    5    \n\t"
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     "ush            %[temp4],    0(%[dst])                     \n\t"
 #else
     "wsbh           %[temp4],    %[temp4]                      \n\t"
@@ -610,8 +614,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   );
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   int temp0, temp1, temp2, temp3;
   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
   const uint32_t* const p_loop2_end = src + num_pixels;
@@ -662,24 +666,27 @@ static void ConvertBGRAToBGR(const uint32_t* src,
 extern void VP8LDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
-  VP8LMapColor32b = MapARGB;
-  VP8LMapColor8b = MapAlpha;
-  VP8LPredictors[5] = Predictor5;
-  VP8LPredictors[6] = Predictor6;
-  VP8LPredictors[7] = Predictor7;
-  VP8LPredictors[8] = Predictor8;
-  VP8LPredictors[9] = Predictor9;
-  VP8LPredictors[10] = Predictor10;
-  VP8LPredictors[11] = Predictor11;
-  VP8LPredictors[12] = Predictor12;
-  VP8LPredictors[13] = Predictor13;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
-  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LMapColor32b = MapARGB_MIPSdspR2;
+  VP8LMapColor8b = MapAlpha_MIPSdspR2;
+
+  VP8LPredictors[5] = Predictor5_MIPSdspR2;
+  VP8LPredictors[6] = Predictor6_MIPSdspR2;
+  VP8LPredictors[7] = Predictor7_MIPSdspR2;
+  VP8LPredictors[8] = Predictor8_MIPSdspR2;
+  VP8LPredictors[9] = Predictor9_MIPSdspR2;
+  VP8LPredictors[10] = Predictor10_MIPSdspR2;
+  VP8LPredictors[11] = Predictor11_MIPSdspR2;
+  VP8LPredictors[12] = Predictor12_MIPSdspR2;
+  VP8LPredictors[13] = Predictor13_MIPSdspR2;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
+  VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
+
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/lossless_msa.c b/thirdparty/libwebp/src/dsp/lossless_msa.c
index f6dd5649ac..9f5472078d 100644
--- a/thirdparty/libwebp/dsp/lossless_msa.c
+++ b/thirdparty/libwebp/src/dsp/lossless_msa.c
@@ -11,12 +11,12 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./lossless.h"
-#include "./msa_macro.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/msa_macro.h"
 
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
@@ -43,7 +43,7 @@
 
 #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do {         \
   uint64_t pix_d;                                          \
-  v16u8 src0, src1, src2, dst0, dst1;                      \
+  v16u8 src0, src1, src2 = { 0 }, dst0, dst1;              \
   LD_UB2(psrc, 16, src0, src1);                            \
   VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1);  \
   ST_UB(dst0, pdst);                                       \
@@ -109,8 +109,8 @@
   dst = VSHF_UB(src, t0, mask1);                                        \
 } while (0)
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   int i;
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
@@ -150,8 +150,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
   const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
@@ -197,8 +197,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_MSA(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
   const uint8_t* ptemp_src = (const uint8_t*)src;
   uint8_t* ptemp_dst = (uint8_t*)dst;
   const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
@@ -244,8 +244,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
   }
 }
 
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
+                                     uint32_t* dst) {
   int i;
   const uint8_t* in = (const uint8_t*)src;
   uint8_t* out = (uint8_t*)dst;
@@ -286,9 +286,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
   }
 }
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
+                                      const uint32_t* src, int num_pixels,
+                                      uint32_t* dst) {
   v16u8 src0, dst0;
   const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
                                          (m->green_to_red_ << 16));
@@ -341,11 +341,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
 extern void VP8LDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
+
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
+  VP8LTransformColorInverse = TransformColorInverse_MSA;
 }
 
 #else  // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/lossless_neon.c b/thirdparty/libwebp/src/dsp/lossless_neon.c
index 1145d5fad0..76a1b6f873 100644
--- a/thirdparty/libwebp/dsp/lossless_neon.c
+++ b/thirdparty/libwebp/src/dsp/lossless_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <arm_neon.h>
 
-#include "./lossless.h"
-#include "./neon.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/neon.h"
 
 //------------------------------------------------------------------------------
 // Colorspace conversion functions
@@ -26,8 +26,8 @@
 #if !defined(WORK_AROUND_GCC)
 // gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
 // gcc-4.8.x at least.
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -41,8 +41,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -53,8 +53,8 @@ static void ConvertBGRAToBGR(const uint32_t* src,
   VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst);  // left-overs
 }
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~15);
   for (; src < end; src += 16) {
     const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
@@ -71,8 +71,8 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~1);
   const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
   for (; src < end; src += 2) {
@@ -89,8 +89,8 @@ static const uint8_t kBGRShuffle[3][8] = {
   { 21, 22, 24, 25, 26, 28, 29, 30 }
 };
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
@@ -116,8 +116,8 @@ static const uint8_t kRGBShuffle[3][8] = {
   { 21, 20, 26, 25, 24, 30, 29, 28 }
 };
 
-static void ConvertBGRAToRGB(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB_NEON(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~7);
   const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
   const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
@@ -139,7 +139,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
 
 #endif   // !WORK_AROUND_GCC
 
-
 //------------------------------------------------------------------------------
 // Predictor Transform
 
@@ -506,8 +505,8 @@ static const uint8_t kGreenShuffle[16] = {
   1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
 };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x16_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x16_t shuffle) {
   return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
                      vtbl1q_u8(argb, vget_high_u8(shuffle)));
 }
@@ -515,15 +514,15 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
 // 255 = byte will be zeroed
 static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255  };
 
-static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
-                                             const uint8x8_t shuffle) {
+static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
+                                                  const uint8x8_t shuffle) {
   return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                      vtbl1_u8(vget_high_u8(argb), shuffle));
 }
 #endif  // USE_VTBLQ
 
-static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
+                                      uint32_t* dst) {
   const uint32_t* const end = src + (num_pixels & ~3);
 #ifdef USE_VTBLQ
   const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -532,7 +531,7 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
 #endif
   for (; src < end; src += 4, dst += 4) {
     const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
-    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
     vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
   }
   // fallthrough and finish off with plain-C
@@ -542,9 +541,9 @@ static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* const src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
+                                       const uint32_t* const src,
+                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 6.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 6)
   const int16_t rb[8] = {
@@ -575,7 +574,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
     const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
     const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
     // 0 g 0 g
-    const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+    const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
     // x dr  x db1
     const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
     // x r'  x   b'
@@ -627,12 +626,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
   VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
   VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
 
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
 
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
+  VP8LTransformColorInverse = TransformColorInverse_NEON;
 }
 
 #else  // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/lossless_sse2.c b/thirdparty/libwebp/src/dsp/lossless_sse2.c
index 15aae93869..653b466cd6 100644
--- a/thirdparty/libwebp/dsp/lossless_sse2.c
+++ b/thirdparty/libwebp/src/dsp/lossless_sse2.c
@@ -11,21 +11,22 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
-#include "./common_sse2.h"
-#include "./lossless.h"
-#include "./lossless_common.h"
+#include "src/dsp/common_sse2.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
 #include <assert.h>
 #include <emmintrin.h>
 
 //------------------------------------------------------------------------------
 // Predictor Transform
 
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
+                                                        uint32_t c1,
+                                                        uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -37,8 +38,9 @@ static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
   return output;
 }
 
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
+                                                        uint32_t c1,
+                                                        uint32_t c2) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
@@ -55,7 +57,7 @@ static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
   return output;
 }
 
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
   int pa_minus_pb;
   const __m128i zero = _mm_setzero_si128();
   const __m128i A0 = _mm_cvtsi32_si128(a);
@@ -88,8 +90,9 @@ static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
   *avg = _mm_sub_epi8(avg1, one);
 }
 
-static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
-                                        __m128i* const avg) {
+static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
+                                             const uint32_t a1,
+                                             __m128i* const avg) {
   // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
   const __m128i ones = _mm_set1_epi8(1);
   const __m128i A0 = _mm_cvtsi32_si128(a0);
@@ -99,7 +102,7 @@ static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
   *avg = _mm_sub_epi8(avg1, one);
 }
 
-static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
@@ -107,15 +110,16 @@ static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
   return _mm_srli_epi16(sum, 1);
 }
 
-static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
   __m128i output;
-  Average2_uint32(a0, a1, &output);
+  Average2_uint32_SSE2(a0, a1, &output);
   return _mm_cvtsi128_si32(output);
 }
 
-static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
+                                          uint32_t a2) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i avg1 = Average2_uint32_16(a0, a2);
+  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
   const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
   const __m128i sum = _mm_add_epi16(avg1, A1);
   const __m128i avg2 = _mm_srli_epi16(sum, 1);
@@ -124,10 +128,10 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
   return output;
 }
 
-static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
-                                     uint32_t a2, uint32_t a3) {
-  const __m128i avg1 = Average2_uint32_16(a0, a1);
-  const __m128i avg2 = Average2_uint32_16(a2, a3);
+static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
+                                          uint32_t a2, uint32_t a3) {
+  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
+  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
   const __m128i sum = _mm_add_epi16(avg2, avg1);
   const __m128i avg3 = _mm_srli_epi16(sum, 1);
   const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@@ -136,41 +140,41 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
 }
 
 static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
+  const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
   return pred;
 }
 static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
+  const uint32_t pred = Average2_SSE2(left, top[-1]);
   return pred;
 }
 static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
+  const uint32_t pred = Average2_SSE2(left, top[0]);
   return pred;
 }
 static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[-1], top[0]);
+  const uint32_t pred = Average2_SSE2(top[-1], top[0]);
   (void)left;
   return pred;
 }
 static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[0], top[1]);
+  const uint32_t pred = Average2_SSE2(top[0], top[1]);
   (void)left;
   return pred;
 }
 static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+  const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
   return pred;
 }
 static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+  const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
   return pred;
 }
 static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
   return pred;
 }
 static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
   return pred;
 }
 
@@ -272,9 +276,24 @@ GENERATE_PREDICTOR_2(9, upper[i + 1])
 #undef GENERATE_PREDICTOR_2
 
 // Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(OUT) do {               \
+  __m128i avgLTL, avg;                    \
+  Average2_m128i(&L, &TL, &avgLTL);       \
+  Average2_m128i(&avgTTR, &avgLTL, &avg); \
+  L = _mm_add_epi8(avg, src);             \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);  \
+} while (0)
+
+#define DO_PRED10_SHIFT do {                                  \
+  /* Rotate the pre-computed values for the next iteration.*/ \
+  avgTTR = _mm_srli_si128(avgTTR, 4);                         \
+  TL = _mm_srli_si128(TL, 4);                                 \
+  src = _mm_srli_si128(src, 4);                               \
+} while (0)
+
 static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
   __m128i L = _mm_cvtsi32_si128(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
@@ -283,79 +302,90 @@ static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
     const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
     __m128i avgTTR;
     Average2_m128i(&T, &TR, &avgTTR);
-    for (j = 0; j < 4; ++j) {
-      __m128i avgLTL, avg;
-      Average2_m128i(&L, &TL, &avgLTL);
-      Average2_m128i(&avgTTR, &avgLTL, &avg);
-      L = _mm_add_epi8(avg, src);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Rotate the pre-computed values for the next iteration.
-      avgTTR = _mm_srli_si128(avgTTR, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-    }
+    DO_PRED10(0);
+    DO_PRED10_SHIFT;
+    DO_PRED10(1);
+    DO_PRED10_SHIFT;
+    DO_PRED10(2);
+    DO_PRED10_SHIFT;
+    DO_PRED10(3);
   }
   if (i != num_pixels) {
     VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
   }
 }
+#undef DO_PRED10
+#undef DO_PRED10_SHIFT
 
 // Predictor11: select.
-static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
-                            __m128i* const out) {
-  // We can unpack with any value on the upper 32 bits, provided it's the same
-  // on both operands (to that their sum of abs diff is zero). Here we use *A.
-  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
-  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
-  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
-  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
-  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
-  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
-  *out = _mm_packs_epi32(s_lo, s_hi);
-}
+#define DO_PRED11(OUT) do {                                            \
+  const __m128i L_lo = _mm_unpacklo_epi32(L, T);                       \
+  const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);                     \
+  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
+  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                        \
+  const __m128i A = _mm_and_si128(mask, L);                            \
+  const __m128i B = _mm_andnot_si128(mask, T);                         \
+  const __m128i pred = _mm_or_si128(A, B); /* pred = (pb>pa) ? L : T*/ \
+  L = _mm_add_epi8(src, pred);                                         \
+  out[i + (OUT)] = _mm_cvtsi128_si32(L);                               \
+} while (0)
+
+#define DO_PRED11_SHIFT do {                                \
+  /* Shift the pre-computed value for the next iteration.*/ \
+  T = _mm_srli_si128(T, 4);                                 \
+  TL = _mm_srli_si128(TL, 4);                               \
+  src = _mm_srli_si128(src, 4);                             \
+  pa = _mm_srli_si128(pa, 4);                               \
+} while (0)
 
 static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
-  int i, j;
+  int i;
+  __m128i pa;
   __m128i L = _mm_cvtsi32_si128(out[-1]);
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
     __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
     __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
-    __m128i pa;
-    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
-    for (j = 0; j < 4; ++j) {
-      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
-      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
-      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
-      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
-      const __m128i A = _mm_and_si128(mask, L);
-      const __m128i B = _mm_andnot_si128(mask, T);
-      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
-      L = _mm_add_epi8(src, pred);
-      out[i + j] = _mm_cvtsi128_si32(L);
-      // Shift the pre-computed value for the next iteration.
-      T = _mm_srli_si128(T, 4);
-      TL = _mm_srli_si128(TL, 4);
-      src = _mm_srli_si128(src, 4);
-      pa = _mm_srli_si128(pa, 4);
+    {
+      // We can unpack with any value on the upper 32 bits, provided it's the
+      // same on both operands (so that their sum of abs diff is zero). Here we
+      // use T.
+      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
+      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
+      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
+      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
+      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
+      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
+      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
     }
+    DO_PRED11(0);
+    DO_PRED11_SHIFT;
+    DO_PRED11(1);
+    DO_PRED11_SHIFT;
+    DO_PRED11(2);
+    DO_PRED11_SHIFT;
+    DO_PRED11(3);
   }
   if (i != num_pixels) {
     VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
   }
 }
+#undef DO_PRED11
+#undef DO_PRED11_SHIFT
 
 // Predictor12: ClampedAddSubtractFull.
-#define DO_PRED12(DIFF, LANE, OUT)                          \
-do {                                                        \
-  const __m128i all = _mm_add_epi16(L, (DIFF));             \
-  const __m128i alls = _mm_packus_epi16(all, all);          \
-  const __m128i res = _mm_add_epi8(src, alls);              \
-  out[i + (OUT)] = _mm_cvtsi128_si32(res);                  \
-  L = _mm_unpacklo_epi8(res, zero);                         \
+#define DO_PRED12(DIFF, LANE, OUT) do {            \
+  const __m128i all = _mm_add_epi16(L, (DIFF));    \
+  const __m128i alls = _mm_packus_epi16(all, all); \
+  const __m128i res = _mm_add_epi8(src, alls);     \
+  out[i + (OUT)] = _mm_cvtsi128_si32(res);         \
+  L = _mm_unpacklo_epi8(res, zero);                \
+} while (0)
+
+#define DO_PRED12_SHIFT(DIFF, LANE) do {                    \
   /* Shift the pre-computed value for the next iteration.*/ \
-  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);        \
+  if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8);      \
   src = _mm_srli_si128(src, 4);                             \
 } while (0)
 
@@ -377,8 +407,11 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
     __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
     __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
     DO_PRED12(diff_lo, 0, 0);
+    DO_PRED12_SHIFT(diff_lo, 0);
     DO_PRED12(diff_lo, 1, 1);
+    DO_PRED12_SHIFT(diff_lo, 1);
     DO_PRED12(diff_hi, 0, 2);
+    DO_PRED12_SHIFT(diff_hi, 0);
     DO_PRED12(diff_hi, 1, 3);
   }
   if (i != num_pixels) {
@@ -386,6 +419,7 @@ static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
   }
 }
 #undef DO_PRED12
+#undef DO_PRED12_SHIFT
 
 // Due to averages with integers, values cannot be accumulated in parallel for
 // predictors 13.
@@ -394,8 +428,8 @@ GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
-static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
-                                 uint32_t* dst) {
+static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
+                                      uint32_t* dst) {
   int i;
   for (i = 0; i + 4 <= num_pixels; i += 4) {
     const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
@@ -414,9 +448,9 @@ static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
 //------------------------------------------------------------------------------
 // Color Transform
 
-static void TransformColorInverse(const VP8LMultipliers* const m,
-                                  const uint32_t* const src, int num_pixels,
-                                  uint32_t* dst) {
+static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
+                                       const uint32_t* const src,
+                                       int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 5.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
   const __m128i mults_rb = _mm_set_epi16(
@@ -454,8 +488,8 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
 //------------------------------------------------------------------------------
 // Color-space conversion functions
 
-static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
-                             uint8_t* dst) {
+static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
+                                  uint8_t* dst) {
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
 
@@ -490,27 +524,26 @@ static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
   }
 }
 
-static void ConvertBGRAToRGBA(const uint32_t* src,
-                              int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
+                                   int num_pixels, uint8_t* dst) {
+  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
   const __m128i* in = (const __m128i*)src;
   __m128i* out = (__m128i*)dst;
   while (num_pixels >= 8) {
-    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
-    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
-    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
-    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
-    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
-    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
-    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
-    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
-    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
-    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
-    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
-    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
-    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
-    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
-    _mm_storeu_si128(out++, rgba0);
-    _mm_storeu_si128(out++, rgba4);
+    const __m128i A1 = _mm_loadu_si128(in++);
+    const __m128i A2 = _mm_loadu_si128(in++);
+    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
+    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
+    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
+    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
+    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i F1 = _mm_or_si128(E1, C1);
+    const __m128i F2 = _mm_or_si128(E2, C2);
+    _mm_storeu_si128(out++, F1);
+    _mm_storeu_si128(out++, F2);
     num_pixels -= 8;
   }
   // left-overs
@@ -519,8 +552,8 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGBA4444(const uint32_t* src,
-                                  int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
+                                       int num_pixels, uint8_t* dst) {
   const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
   const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
   const __m128i* in = (const __m128i*)src;
@@ -541,7 +574,7 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
     const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
     const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
     const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
 #else
     const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
@@ -555,8 +588,8 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToRGB565(const uint32_t* src,
-                                int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
+                                     int num_pixels, uint8_t* dst) {
   const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
   const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
   const __m128i mask_0x07 = _mm_set1_epi8(0x07);
@@ -582,7 +615,7 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
     const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
     const __m128i b1 = _mm_srli_epi16(b0, 3);
     const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);     // rggb0...rggb7
 #else
     const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);     // bgrb0...bgrb7
@@ -596,8 +629,8 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
   }
 }
 
-static void ConvertBGRAToBGR(const uint32_t* src,
-                             int num_pixels, uint8_t* dst) {
+static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
+                                  int num_pixels, uint8_t* dst) {
   const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
   const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
   const __m128i* in = (const __m128i*)src;
@@ -660,14 +693,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
   VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
   VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
 
-  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-  VP8LTransformColorInverse = TransformColorInverse;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
+  VP8LTransformColorInverse = TransformColorInverse_SSE2;
 
-  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
-  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
-  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
-  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
+  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
+  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
+  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
+  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/mips_macro.h b/thirdparty/libwebp/src/dsp/mips_macro.h
index 44aba9b71d..44aba9b71d 100644
--- a/thirdparty/libwebp/dsp/mips_macro.h
+++ b/thirdparty/libwebp/src/dsp/mips_macro.h
diff --git a/thirdparty/libwebp/dsp/msa_macro.h b/thirdparty/libwebp/src/dsp/msa_macro.h
index d0e5f45e01..dfacda6ccd 100644
--- a/thirdparty/libwebp/dsp/msa_macro.h
+++ b/thirdparty/libwebp/src/dsp/msa_macro.h
@@ -22,6 +22,7 @@
 #endif
 
 #ifdef CLANG_BUILD
+  #define ALPHAVAL  (-1)
   #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)a, b)
   #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)a, b)
   #define SRAI_B(a, b)  __msa_srai_b((v16i8)a, b)
@@ -32,6 +33,7 @@
   #define ANDI_B(a, b)  __msa_andi_b((v16u8)a, b)
   #define ORI_B(a, b)   __msa_ori_b((v16u8)a, b)
 #else
+  #define ALPHAVAL  (0xff)
   #define ADDVI_H(a, b)  (a + b)
   #define ADDVI_W(a, b)  (a + b)
   #define SRAI_B(a, b)  (a >> b)
diff --git a/thirdparty/libwebp/dsp/neon.h b/thirdparty/libwebp/src/dsp/neon.h
index 3b548a6855..aa1dea1301 100644
--- a/thirdparty/libwebp/dsp/neon.h
+++ b/thirdparty/libwebp/src/dsp/neon.h
@@ -14,11 +14,12 @@
 
 #include <arm_neon.h>
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 // Right now, some intrinsics functions seem slower, so we disable them
-// everywhere except aarch64 where the inline assembly is incompatible.
-#if defined(__aarch64__)
+// everywhere except newer clang/gcc or aarch64 where the inline assembly is
+// incompatible.
+#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
 #define WEBP_USE_INTRINSICS   // use intrinsics when possible
 #endif
 
@@ -43,11 +44,11 @@
 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
-#if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
+#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
 #define WORK_AROUND_GCC
 #endif
 
-static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
+static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
   uint64x2x2_t row01, row23;
 
   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
diff --git a/thirdparty/libwebp/dsp/rescaler.c b/thirdparty/libwebp/src/dsp/rescaler.c
index 0f54502352..4b6b7834e5 100644
--- a/thirdparty/libwebp/dsp/rescaler.c
+++ b/thirdparty/libwebp/src/dsp/rescaler.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 
-#include "./dsp.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
@@ -25,7 +25,8 @@
 //------------------------------------------------------------------------------
 // Row import
 
-void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -56,7 +57,8 @@ void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
   }
 }
 
-void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   int channel;
@@ -92,7 +94,7 @@ void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
 //------------------------------------------------------------------------------
 // Row export
 
-void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -123,7 +125,7 @@ void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
   }
 }
 
-void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
+void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -207,11 +209,14 @@ static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
   if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+#if !defined(WEBP_REDUCE_SIZE)
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
+  WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
+#endif
 
-  WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
-  WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
-  WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
-  WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
+  WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
+  WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -219,11 +224,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
       WebPRescalerDspInitSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPRescalerDspInitNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPRescalerDspInitMIPS32();
@@ -240,5 +240,18 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPRescalerDspInitNEON();
+  }
+#endif
+
+  assert(WebPRescalerExportRowExpand != NULL);
+  assert(WebPRescalerExportRowShrink != NULL);
+  assert(WebPRescalerImportRowExpand != NULL);
+  assert(WebPRescalerImportRowShrink != NULL);
+#endif   // WEBP_REDUCE_SIZE
   rescaler_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/rescaler_mips32.c b/thirdparty/libwebp/src/dsp/rescaler_mips32.c
index e09ad5d19f..542f7e5970 100644
--- a/thirdparty/libwebp/dsp/rescaler_mips32.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -11,17 +11,18 @@
 //
 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 // Row import
 
-static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int fx_scale = wrk->fx_scale;
@@ -80,7 +81,8 @@ static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
   }
 }
 
-static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
+static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
+                                   const uint8_t* src) {
   const int x_stride = wrk->num_channels;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
@@ -144,7 +146,7 @@ static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
 //------------------------------------------------------------------------------
 // Row export
 
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -207,7 +209,7 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
   }
 }
 
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
@@ -278,10 +280,10 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
-  WebPRescalerImportRowExpand = ImportRowExpand;
-  WebPRescalerImportRowShrink = ImportRowShrink;
-  WebPRescalerExportRowExpand = ExportRowExpand;
-  WebPRescalerExportRowShrink = ExportRowShrink;
+  WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
+  WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
+  WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/rescaler_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
index 2308d64544..b78aac15e6 100644
--- a/thirdparty/libwebp/dsp/rescaler_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -11,12 +11,12 @@
 //
 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MIPS_DSP_R2)
+#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
+#include "src/utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -24,7 +24,7 @@
 //------------------------------------------------------------------------------
 // Row export
 
-static void ExportRowShrink(WebPRescaler* const wrk) {
+static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
   int i;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
   uint8_t* dst = wrk->dst;
@@ -162,7 +162,7 @@ static void ExportRowShrink(WebPRescaler* const wrk) {
   }
 }
 
-static void ExportRowExpand(WebPRescaler* const wrk) {
+static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
   int i;
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
@@ -303,8 +303,8 @@ static void ExportRowExpand(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
-  WebPRescalerExportRowExpand = ExportRowExpand;
-  WebPRescalerExportRowShrink = ExportRowShrink;
+  WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
+  WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/rescaler_msa.c b/thirdparty/libwebp/src/dsp/rescaler_msa.c
index 2c10e55d8c..f3bc99f1cd 100644
--- a/thirdparty/libwebp/dsp/rescaler_msa.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_msa.c
@@ -11,14 +11,14 @@
 //
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_MSA)
+#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
 
 #include <assert.h>
 
-#include "../utils/rescaler_utils.h"
-#include "./msa_macro.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/dsp/msa_macro.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -246,7 +246,7 @@ static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
   }
 }
 
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -411,7 +411,7 @@ static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
   }
 }
 
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
   uint8_t* dst = wrk->dst;
   rescaler_t* irow = wrk->irow;
   const int x_out_max = wrk->dst_width * wrk->num_channels;
@@ -433,8 +433,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
-  WebPRescalerExportRowExpand = RescalerExportRowExpand;
-  WebPRescalerExportRowShrink = RescalerExportRowShrink;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
 }
 
 #else     // !WEBP_USE_MSA
diff --git a/thirdparty/libwebp/dsp/rescaler_neon.c b/thirdparty/libwebp/src/dsp/rescaler_neon.c
index b2dd8f30cc..3eff9fbaf4 100644
--- a/thirdparty/libwebp/dsp/rescaler_neon.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_neon.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_NEON)
+#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
 
 #include <arm_neon.h>
 #include <assert.h>
-#include "./neon.h"
-#include "../utils/rescaler_utils.h"
+#include "src/dsp/neon.h"
+#include "src/utils/rescaler_utils.h"
 
 #define ROUNDER (WEBP_RESCALER_ONE >> 1)
 #define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
@@ -41,9 +41,9 @@
 #error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
 #endif
 
-static uint32x4_t Interpolate(const rescaler_t* const frow,
-                              const rescaler_t* const irow,
-                              uint32_t A, uint32_t B) {
+static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
+                                   const rescaler_t* const irow,
+                                   uint32_t A, uint32_t B) {
   LOAD_32x4(frow, A0);
   LOAD_32x4(irow, B0);
   const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
@@ -56,7 +56,7 @@ static uint32x4_t Interpolate(const rescaler_t* const frow,
   return E;
 }
 
-static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -91,9 +91,9 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
     const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
     for (x_out = 0; x_out < max_span; x_out += 8) {
       const uint32x4_t C0 =
-          Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
+          Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
       const uint32x4_t C1 =
-          Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
+          Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
       const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
       const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
       const uint16x4_t E0 = vmovn_u32(D0);
@@ -112,7 +112,7 @@ static void RescalerExportRowExpand(WebPRescaler* const wrk) {
   }
 }
 
-static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -175,8 +175,8 @@ static void RescalerExportRowShrink(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
-  WebPRescalerExportRowExpand = RescalerExportRowExpand;
-  WebPRescalerExportRowShrink = RescalerExportRowShrink;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
 }
 
 #else     // !WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/rescaler_sse2.c b/thirdparty/libwebp/src/dsp/rescaler_sse2.c
index 8271c22e05..f93b204fe1 100644
--- a/thirdparty/libwebp/dsp/rescaler_sse2.c
+++ b/thirdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
-#if defined(WEBP_USE_SSE2)
+#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
 #include <emmintrin.h>
 
 #include <assert.h>
-#include "../utils/rescaler_utils.h"
-#include "../utils/utils.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
@@ -27,7 +27,7 @@
 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
 
 // input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
-static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
+static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
   const __m128i B = _mm_unpacklo_epi8(A, zero);              // A0B0C0D0E0F0G0H0
@@ -36,14 +36,14 @@ static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
 }
 
 // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
-static void LoadHeightPixels(const uint8_t* const src, __m128i* out) {
+static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
   *out = _mm_unpacklo_epi8(A, zero);
 }
 
-static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
-                                        const uint8_t* src) {
+static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
+                                         const uint8_t* src) {
   rescaler_t* frow = wrk->frow;
   const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
   const int x_add = wrk->x_add;
@@ -54,10 +54,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
   assert(wrk->x_expand);
   if (wrk->num_channels == 4) {
     if (wrk->src_width < 2) {
-      WebPRescalerImportRowExpandC(wrk, src);
+      WebPRescalerImportRowExpand_C(wrk, src);
       return;
     }
-    LoadTwoPixels(src, &cur_pixels);
+    LoadTwoPixels_SSE2(src, &cur_pixels);
     src += 4;
     while (1) {
       const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
@@ -67,7 +67,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
       if (frow >= frow_end) break;
       accum -= wrk->x_sub;
       if (accum < 0) {
-        LoadTwoPixels(src, &cur_pixels);
+        LoadTwoPixels_SSE2(src, &cur_pixels);
         src += 4;
         accum += x_add;
       }
@@ -76,10 +76,10 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
     int left;
     const uint8_t* const src_limit = src + wrk->src_width - 8;
     if (wrk->src_width < 8) {
-      WebPRescalerImportRowExpandC(wrk, src);
+      WebPRescalerImportRowExpand_C(wrk, src);
       return;
     }
-    LoadHeightPixels(src, &cur_pixels);
+    LoadHeightPixels_SSE2(src, &cur_pixels);
     src += 7;
     left = 7;
     while (1) {
@@ -94,7 +94,7 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
         if (--left) {
           cur_pixels = _mm_srli_si128(cur_pixels, 2);
         } else if (src <= src_limit) {
-          LoadHeightPixels(src, &cur_pixels);
+          LoadHeightPixels_SSE2(src, &cur_pixels);
           src += 7;
           left = 7;
         } else {   // tail
@@ -110,8 +110,8 @@ static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
   assert(accum == 0);
 }
 
-static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
-                                        const uint8_t* src) {
+static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
+                                         const uint8_t* src) {
   const int x_sub = wrk->x_sub;
   int accum = 0;
   const __m128i zero = _mm_setzero_si128();
@@ -123,7 +123,7 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
   const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
 
   if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
-    WebPRescalerImportRowShrinkC(wrk, src);
+    WebPRescalerImportRowShrink_C(wrk, src);
     return;
   }
   assert(!WebPRescalerInputDone(wrk));
@@ -169,12 +169,12 @@ static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
 // Row export
 
 // load *src as epi64, multiply by mult and store result in [out0 ... out3]
-static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
-                                            const __m128i* const mult,
-                                            __m128i* const out0,
-                                            __m128i* const out1,
-                                            __m128i* const out2,
-                                            __m128i* const out3) {
+static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
+                                                 const __m128i* const mult,
+                                                 __m128i* const out0,
+                                                 __m128i* const out1,
+                                                 __m128i* const out2,
+                                                 __m128i* const out3) {
   const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
   const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
   const __m128i A2 = _mm_srli_epi64(A0, 32);
@@ -192,12 +192,12 @@ static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
   }
 }
 
-static WEBP_INLINE void ProcessRow(const __m128i* const A0,
-                                   const __m128i* const A1,
-                                   const __m128i* const A2,
-                                   const __m128i* const A3,
-                                   const __m128i* const mult,
-                                   uint8_t* const dst) {
+static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
+                                        const __m128i* const A1,
+                                        const __m128i* const A2,
+                                        const __m128i* const A3,
+                                        const __m128i* const mult,
+                                        uint8_t* const dst) {
   const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
   const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
   const __m128i B0 = _mm_mul_epu32(*A0, *mult);
@@ -210,7 +210,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
   const __m128i C3 = _mm_add_epi64(B3, rounder);
   const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
   const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
-#if (WEBP_RESCALER_FIX < 32)
+#if (WEBP_RESCALER_RFIX < 32)
   const __m128i D2 =
       _mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
   const __m128i D3 =
@@ -226,7 +226,7 @@ static WEBP_INLINE void ProcessRow(const __m128i* const A0,
   _mm_storel_epi64((__m128i*)dst, G);
 }
 
-static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -240,8 +240,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
   if (wrk->y_accum == 0) {
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3;
-      LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
-      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+      LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
+      ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
       const uint32_t J = frow[x_out];
@@ -257,8 +257,8 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
     const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3, B0, B1, B2, B3;
-      LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
-      LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
+      LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
       {
         const __m128i C0 = _mm_add_epi64(A0, B0);
         const __m128i C1 = _mm_add_epi64(A1, B1);
@@ -272,7 +272,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
         const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
         const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
         const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
-        ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
+        ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
       }
     }
     for (; x_out < x_out_max; ++x_out) {
@@ -286,7 +286,7 @@ static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
   }
 }
 
-static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
+static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
   int x_out;
   uint8_t* const dst = wrk->dst;
   rescaler_t* const irow = wrk->irow;
@@ -303,8 +303,8 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
     const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3, B0, B1, B2, B3;
-      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
-      LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
+      LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
       {
         const __m128i C0 = _mm_add_epi64(B0, rounder);
         const __m128i C1 = _mm_add_epi64(B1, rounder);
@@ -324,7 +324,7 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
         const __m128i G1 = _mm_or_si128(D1, F3);
         _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
         _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
-        ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+        ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
       }
     }
     for (; x_out < x_out_max; ++x_out) {
@@ -340,10 +340,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
     const __m128i zero = _mm_setzero_si128();
     for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
       __m128i A0, A1, A2, A3;
-      LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+      LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
       _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
       _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
-      ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+      ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
     }
     for (; x_out < x_out_max; ++x_out) {
       const int v = (int)MULT_FIX(irow[x_out], scale);
@@ -362,10 +362,10 @@ static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
 extern void WebPRescalerDspInitSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
-  WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
-  WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
-  WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
-  WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
+  WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
+  WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
+  WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
+  WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
 }
 
 #else  // !WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/src/dsp/ssim.c b/thirdparty/libwebp/src/dsp/ssim.c
new file mode 100644
index 0000000000..dc1b518a33
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/ssim.c
@@ -0,0 +1,166 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>  // for abs()
+
+#include "src/dsp/dsp.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR
+
+// Hat-shaped 1-D filter (2-D weight = product of two taps). Tap sum is 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+  1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;   // (sum{kWeight})^2
+
+static WEBP_INLINE double SSIMCalculation(   // score from weighted sums
+    const VP8DistoStats* const stats, uint32_t N  /*num samples*/) {
+  const uint32_t w2 =  N * N;       // N^2: scales the constants below
+  const uint32_t C1 = 20 * w2;      // added to the first factor of fnum/fden
+  const uint32_t C2 = 60 * w2;      // added to num_S/den_S before the shift
+  const uint32_t C3 = 8 * 8 * w2;   // 'dark' limit ~= 6
+  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+  if (xmxm + ymym >= C3) {
+    const int64_t xmym = (int64_t)stats->xm * stats->ym;
+    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
+    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // descale by 8 bits (>> 8) to prevent overflow in the fnum/fden multiply.
+    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+    const uint64_t den_S = (sxx + syy + C2) >> 8;
+    const uint64_t fnum = (2 * xmym + C1) * num_S;
+    const uint64_t fden = (xmxm + ymym + C1) * den_S;
+    const double r = (double)fnum / fden;
+    assert(r >= 0. && r <= 1.0);   // the ratio is expected to stay in [0, 1]
+    return r;
+  }
+  return 1.;   // area is too dark to contribute meaningfully
+}
+
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, kWeightSum);   // N is the constant kWeightSum
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+  return SSIMCalculation(stats, stats->w);   // N is the accumulated weight
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+                               const uint8_t* src2, int stride2,
+                               int xo, int yo, int W, int H) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
+  const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
+  const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
+                                                  : yo + VP8_SSIM_KERNEL;
+  const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
+  const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
+                                                  : xo + VP8_SSIM_KERNEL;
+  int x, y;   // [xmin, xmax] x [ymin, ymax]: window around (xo, yo), clamped
+  src1 += ymin * stride1;   // move both pointers to the first window row
+  src2 += ymin * stride2;
+  for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
+    for (x = xmin; x <= xmax; ++x) {
+      const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+                       * kWeight[VP8_SSIM_KERNEL + y - yo];   // tap by offset
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.w   += w;   // total weight; read back as N for the clipped case
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStatsClipped(&stats);
+}
+
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+                        const uint8_t* src2, int stride2) {
+  VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };   // stats.w stays 0 here
+  int x, y;   // src1/src2 point at the first sample of the unclipped window
+  for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
+    for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
+      const uint32_t w = kWeight[x] * kWeight[y];   // 2-D weight of (x, y)
+      const uint32_t s1 = src1[x];
+      const uint32_t s2 = src2[x];
+      stats.xm  += w * s1;
+      stats.ym  += w * s2;
+      stats.xxm += w * s1 * s1;
+      stats.xym += w * s1 * s2;
+      stats.yym += w * s2 * s2;
+    }
+  }
+  return VP8SSIMFromStats(&stats);   // uses kWeightSum instead of stats.w
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_DISABLE_STATS)
+static uint32_t AccumulateSSE_C(const uint8_t* src1,
+                                const uint8_t* src2, int len) {
+  int i;
+  uint32_t sse2 = 0;   // running sum of squared differences
+  assert(len <= 65535);  // to ensure that accumulation fits within uint32_t
+  for (i = 0; i < len; ++i) {
+    const int32_t diff = src1[i] - src2[i];   // in [-255, 255]
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_REDUCE_SIZE)
+VP8SSIMGetFunc VP8SSIMGet;                 // assigned in VP8SSIMDspInit()
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;   // assigned in VP8SSIMDspInit()
+#endif
+#if !defined(WEBP_DISABLE_STATS)
+VP8AccumulateSSEFunc VP8AccumulateSSE;     // assigned in VP8SSIMDspInit()
+#endif
+
+extern void VP8SSIMDspInitSSE2(void);
+
+static volatile VP8CPUInfo ssim_last_cpuinfo_used =
+    (VP8CPUInfo)&ssim_last_cpuinfo_used;   // initial value: its own address
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
+  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;   // already done
+
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGetClipped = SSIMGetClipped_C;   // plain-C versions installed first
+  VP8SSIMGet = SSIMGet_C;
+#endif
+
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_C;
+#endif
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SSIMDspInitSSE2();   // may replace some of the pointers set above
+    }
+#endif
+  }
+
+  ssim_last_cpuinfo_used = VP8GetCPUInfo;   // makes later calls return early
+}
diff --git a/thirdparty/libwebp/src/dsp/ssim_sse2.c b/thirdparty/libwebp/src/dsp/ssim_sse2.c
new file mode 100644
index 0000000000..1dcb0eb0ec
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/ssim_sse2.c
@@ -0,0 +1,165 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of distortion calculation
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "src/dsp/common_sse2.h"
+
+#if !defined(WEBP_DISABLE_STATS)
+
+// Helper: *sum receives four 32b partial sums of (a - b)^2 over 16 bytes.
+static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
+                                               __m128i* const sum) {
+  // take abs(a-b) in 8b
+  const __m128i a_b = _mm_subs_epu8(a, b);   // saturating: 0 where a < b
+  const __m128i b_a = _mm_subs_epu8(b, a);   // saturating: 0 where b < a
+  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);   // one operand is always 0
+  // zero-extend to 16b
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+  // multiply with self
+  const __m128i sum1 = _mm_madd_epi16(C0, C0);   // squares, added pairwise
+  const __m128i sum2 = _mm_madd_epi16(C1, C1);
+  *sum = _mm_add_epi32(sum1, sum2);
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+                                   const uint8_t* src2, int len) {
+  int i = 0;
+  uint32_t sse2 = 0;   // sum of squared differences over src1/src2[0..len)
+  if (len >= 16) {
+    const int limit = len - 32;   // largest 'i' leaving room for two loads
+    uint32_t tmp[4];   // unsigned: the 4-lane total can exceed INT32_MAX
+    __m128i sum1;
+    __m128i sum = _mm_setzero_si128();
+    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+    i += 16;
+    while (i <= limit) {   // two 16-byte chunks per iteration
+      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      __m128i sum2;
+      i += 16;
+      SubtractAndSquare_SSE2(a0, b0, &sum1);
+      sum = _mm_add_epi32(sum, sum1);
+      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+      i += 16;
+      SubtractAndSquare_SSE2(a1, b1, &sum2);
+      sum = _mm_add_epi32(sum, sum2);
+    }
+    SubtractAndSquare_SSE2(a0, b0, &sum1);   // a0/b0 were loaded but not summed
+    sum = _mm_add_epi32(sum, sum1);
+    _mm_storeu_si128((__m128i*)tmp, sum);
+    sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);   // well-defined mod 2^32
+  }
+
+  for (; i < len; ++i) {   // scalar tail for the remaining (< 16) bytes
+    const int32_t diff = src1[i] - src2[i];
+    sse2 += diff * diff;
+  }
+  return sse2;
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
+#if !defined(WEBP_REDUCE_SIZE)
+
+static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
+  uint16_t tmp[8];
+  const __m128i a = _mm_srli_si128(*m, 8);   // upper four lanes moved down
+  const __m128i b = _mm_add_epi16(*m, a);    // lanes 0..3: m[i] + m[i + 4]
+  _mm_storeu_si128((__m128i*)tmp, b);
+  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];   // final sum in 32b
+}
+
+static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
+  const __m128i a = _mm_srli_si128(*m, 8);   // upper two lanes moved down
+  const __m128i b = _mm_add_epi32(*m, a);    // lanes 0..1: m[i] + m[i + 2]
+  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));   // lane 0: total
+  return (uint32_t)_mm_cvtsi128_si32(c);     // extract lane 0
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };   // 8th tap: 0
+
+#define ACCUMULATE_ROW(WEIGHT) do {                         \
+  /* compute row weight (Wx * Wy) */                        \
+  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
+  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
+  /* process 8 bytes at a time (7 bytes, actually) */       \
+  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+  /* convert to 16b and multiply by weight */               \
+  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
+  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
+  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
+  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
+  /* accumulate */                                          \
+  xm  = _mm_add_epi16(xm, wa1);                             \
+  ym  = _mm_add_epi16(ym, wb1);                             \
+  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
+  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
+  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
+  src1 += stride1;                                          \
+  src2 += stride2;                                          \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+                           const uint8_t* src2, int stride2) {
+  VP8DistoStats stats;   // only the five sums assigned below are filled in
+  const __m128i zero = _mm_setzero_si128();
+  __m128i xm = zero, ym = zero;                // 16b accums
+  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
+  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+  assert(2 * VP8_SSIM_KERNEL + 1 == 7);   // the code below is unrolled for 7
+  ACCUMULATE_ROW(1);   // one row per call; loads 8 bytes from src1 and src2
+  ACCUMULATE_ROW(2);   // and advances both pointers by their stride
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(4);
+  ACCUMULATE_ROW(3);
+  ACCUMULATE_ROW(2);
+  ACCUMULATE_ROW(1);
+  stats.xm  = HorizontalAdd16b_SSE2(&xm);
+  stats.ym  = HorizontalAdd16b_SSE2(&ym);
+  stats.xxm = HorizontalAdd32b_SSE2(&xxm);
+  stats.xym = HorizontalAdd32b_SSE2(&xym);
+  stats.yym = HorizontalAdd32b_SSE2(&yym);
+  return VP8SSIMFromStats(&stats);
+}
+
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+#if !defined(WEBP_DISABLE_STATS)
+  VP8AccumulateSSE = AccumulateSSE_SSE2;   // SSE2 sum of squared differences
+#endif
+#if !defined(WEBP_REDUCE_SIZE)
+  VP8SSIMGet = SSIMGet_SSE2;   // VP8SSIMGetClipped is not assigned here
+#endif
+}
+
+#else  // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
+
+#endif  // WEBP_USE_SSE2
diff --git a/thirdparty/libwebp/dsp/upsampling.c b/thirdparty/libwebp/src/dsp/upsampling.c
index 265e722c10..e72626a82a 100644
--- a/thirdparty/libwebp/dsp/upsampling.c
+++ b/thirdparty/libwebp/src/dsp/upsampling.c
@@ -11,8 +11,8 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
-#include "./yuv.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/yuv.h"
 
 #include <assert.h>
 
@@ -63,17 +63,17 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
       const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
       const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
       FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
-           top_dst + (2 * x - 1) * XSTEP);                                     \
+           top_dst + (2 * x - 1) * (XSTEP));                                   \
       FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
-           top_dst + (2 * x - 0) * XSTEP);                                     \
+           top_dst + (2 * x - 0) * (XSTEP));                                   \
     }                                                                          \
     if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
       const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
       FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
-           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+           bottom_dst + (2 * x - 1) * (XSTEP));                                \
       FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
-           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+           bottom_dst + (2 * x + 0) * (XSTEP));                                \
     }                                                                          \
     tl_uv = t_uv;                                                              \
     l_uv = uv;                                                                 \
@@ -82,24 +82,50 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     {                                                                          \
       const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
       FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
-           top_dst + (len - 1) * XSTEP);                                       \
+           top_dst + (len - 1) * (XSTEP));                                     \
     }                                                                          \
     if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
       FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
-           bottom_dst + (len - 1) * XSTEP);                                    \
+           bottom_dst + (len - 1) * (XSTEP));                                  \
     }                                                                          \
   }                                                                            \
 }
 
 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+#if !WEBP_NEON_OMIT_C_CODE
+UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgbLinePair_C,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair_C,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair_C,  VP8YuvToRgb565,  2)
+#else
+static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
+                              const uint8_t* top_u, const uint8_t* top_v,
+                              const uint8_t* cur_u, const uint8_t* cur_v,
+                              uint8_t* top_dst, uint8_t* bottom_dst, int len) {
+  (void)top_y;   // every parameter is deliberately unused
+  (void)bottom_y;
+  (void)top_u;
+  (void)top_v;
+  (void)cur_u;
+  (void)cur_v;
+  (void)top_dst;
+  (void)bottom_dst;
+  (void)len;
+  assert(0);   // COLORSPACE SUPPORT NOT COMPILED
+}
+#define UpsampleArgbLinePair_C EmptyUpsampleFunc   /* placeholder aliases */
+#define UpsampleRgbLinePair_C EmptyUpsampleFunc
+#define UpsampleBgrLinePair_C EmptyUpsampleFunc
+#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
+#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
+#endif   // WEBP_REDUCE_CSP
+
+#endif
 
 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -141,7 +167,6 @@ DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
 
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
   WebPInitUpsamplers();
-  VP8YUVInit();
 #ifdef FANCY_UPSAMPLING
   return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
 #else
@@ -158,16 +183,33 @@ extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
 void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,           \
                uint8_t* dst, int len) {                                        \
   int i;                                                                       \
-  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]);         \
 }
 
-YUV444_FUNC(WebPYuv444ToRgbC,      VP8YuvToRgb,  3)
-YUV444_FUNC(WebPYuv444ToBgrC,      VP8YuvToBgr,  3)
-YUV444_FUNC(WebPYuv444ToRgbaC,     VP8YuvToRgba, 4)
-YUV444_FUNC(WebPYuv444ToBgraC,     VP8YuvToBgra, 4)
-YUV444_FUNC(WebPYuv444ToArgbC,     VP8YuvToArgb, 4)
-YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
-YUV444_FUNC(WebPYuv444ToRgb565C,   VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgba_C,     VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgra_C,     VP8YuvToBgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(WebPYuv444ToRgb_C,      VP8YuvToRgb,  3)
+YUV444_FUNC(WebPYuv444ToBgr_C,      VP8YuvToBgr,  3)
+YUV444_FUNC(WebPYuv444ToArgb_C,     VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565_C,   VP8YuvToRgb565, 2)
+#else
+static void EmptyYuv444Func(const uint8_t* y,
+                            const uint8_t* u, const uint8_t* v,
+                            uint8_t* dst, int len) {
+  (void)y;   // every parameter is deliberately unused; dst is not written
+  (void)u;
+  (void)v;
+  (void)dst;
+  (void)len;   // NOTE(review): no assert(0) here, unlike EmptyUpsampleFunc
+}
+#define WebPYuv444ToRgb_C EmptyYuv444Func   /* placeholder aliases */
+#define WebPYuv444ToBgr_C EmptyYuv444Func
+#define WebPYuv444ToArgb_C EmptyYuv444Func
+#define WebPYuv444ToRgba4444_C EmptyYuv444Func
+#define WebPYuv444ToRgb565_C EmptyYuv444Func
+#endif   // WEBP_REDUCE_CSP
 
 #undef YUV444_FUNC
 
@@ -182,17 +224,17 @@ static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
   if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
 
-  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgbC;
-  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgrC;
-  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
-  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565C;
-  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgbaC;
-  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgraC;
-  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgbC;
-  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
+  WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgb_C;
+  WebPYUV444Converters[MODE_BGR]       = WebPYuv444ToBgr_C;
+  WebPYUV444Converters[MODE_ARGB]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
+  WebPYUV444Converters[MODE_RGB_565]   = WebPYuv444ToRgb565_C;
+  WebPYUV444Converters[MODE_rgbA]      = WebPYuv444ToRgba_C;
+  WebPYUV444Converters[MODE_bgrA]      = WebPYuv444ToBgra_C;
+  WebPYUV444Converters[MODE_Argb]      = WebPYuv444ToArgb_C;
+  WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -224,17 +266,19 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
   if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
 
 #ifdef FANCY_UPSAMPLING
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#if !WEBP_NEON_OMIT_C_CODE
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair_C;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair_C;
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair_C;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair_C;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair_C;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair_C;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
+#endif
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
   if (VP8GetCPUInfo != NULL) {
@@ -243,11 +287,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
       WebPInitUpsamplersSSE2();
     }
 #endif
-#if defined(WEBP_USE_NEON)
-    if (VP8GetCPUInfo(kNEON)) {
-      WebPInitUpsamplersNEON();
-    }
-#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitUpsamplersMIPSdspR2();
@@ -259,6 +298,26 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
     }
 #endif
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitUpsamplersNEON();
+  }
+#endif
+
+  assert(WebPUpsamplers[MODE_RGBA] != NULL);
+  assert(WebPUpsamplers[MODE_BGRA] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA] != NULL);
+  assert(WebPUpsamplers[MODE_bgrA] != NULL);
+  assert(WebPUpsamplers[MODE_RGB] != NULL);
+  assert(WebPUpsamplers[MODE_BGR] != NULL);
+  assert(WebPUpsamplers[MODE_ARGB] != NULL);
+  assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
+  assert(WebPUpsamplers[MODE_RGB_565] != NULL);
+  assert(WebPUpsamplers[MODE_Argb] != NULL);
+  assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+
 #endif  // FANCY_UPSAMPLING
   upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/upsampling_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
index ed2eb74825..10d499d771 100644
--- a/thirdparty/libwebp/dsp/upsampling_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
@@ -12,14 +12,12 @@
 // Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
 //            Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
 #include <assert.h>
-#include "./yuv.h"
-
-#if !defined(WEBP_YUV_USE_TABLE)
+#include "src/dsp/yuv.h"
 
 #define YUV_TO_RGB(Y, U, V, R, G, B) do {                                      \
     const int t1 = MultHi(Y, 19077);                                           \
@@ -48,6 +46,7 @@
     );                                                                         \
   } while (0)
 
+#if !defined(WEBP_REDUCE_CSP)
 static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
   int r, g, b;
   YUV_TO_RGB(y, u, v, r, g, b);
@@ -68,7 +67,7 @@ static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
   {
     const int rg = (r & 0xf8) | (g >> 5);
     const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     rgb[0] = gb;
     rgb[1] = rg;
 #else
@@ -84,7 +83,7 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
   {
     const int rg = (r & 0xf0) | (g >> 4);
     const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     argb[0] = ba;
     argb[1] = rg;
 #else
@@ -93,11 +92,12 @@ static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
 #endif
    }
 }
-#endif  // WEBP_YUV_USE_TABLE
+#endif   // WEBP_REDUCE_CSP
 
 //-----------------------------------------------------------------------------
 // Alpha handling variants
 
+#if !defined(WEBP_REDUCE_CSP)
 static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                   uint8_t* const argb) {
   int r, g, b;
@@ -107,6 +107,7 @@ static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
   argb[2] = g;
   argb[3] = b;
 }
+#endif   // WEBP_REDUCE_CSP
 static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
                                   uint8_t* const bgra) {
   int r, g, b;
@@ -200,13 +201,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }
 
 // All variants implemented.
-UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
 UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
@@ -217,17 +220,19 @@ UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
 extern void WebPInitUpsamplersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
   WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
   WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
   WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
@@ -242,13 +247,15 @@ static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
   for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }
 
-YUV444_FUNC(Yuv444ToRgb,      YuvToRgb,      3)
-YUV444_FUNC(Yuv444ToBgr,      YuvToBgr,      3)
 YUV444_FUNC(Yuv444ToRgba,     YuvToRgba,     4)
 YUV444_FUNC(Yuv444ToBgra,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb,      YuvToRgb,      3)
+YUV444_FUNC(Yuv444ToBgr,      YuvToBgr,      3)
 YUV444_FUNC(Yuv444ToArgb,     YuvToArgb,     4)
 YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
 YUV444_FUNC(Yuv444ToRgb565,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef YUV444_FUNC
 
@@ -258,17 +265,19 @@ YUV444_FUNC(Yuv444ToRgb565,   YuvToRgb565,   2)
 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
-  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
   WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
   WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
   WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb;
   WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
   WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565;
-  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
   WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb;
   WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/dsp/upsampling_msa.c b/thirdparty/libwebp/src/dsp/upsampling_msa.c
index f24926fa94..535ffb772c 100644
--- a/thirdparty/libwebp/dsp/upsampling_msa.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_msa.c
@@ -12,12 +12,12 @@
 // Author: Prashant Patil (prashant.patil@imgtec.com)
 
 #include <string.h>
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MSA)
 
-#include "./msa_macro.h"
-#include "./yuv.h"
+#include "src/dsp/msa_macro.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -274,7 +274,7 @@ static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
   const int b = Clip8(b1 >> 6);
   const int rg = (r & 0xf8) | (g >> 5);
   const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   rgb[0] = gb;
   rgb[1] = rg;
 #else
@@ -293,7 +293,7 @@ static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
   const int b = Clip8(b1 >> 6);
   const int rg = (r & 0xf0) | (g >> 4);
   const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   argb[0] = ba;
   argb[1] = rg;
 #else
@@ -374,7 +374,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(R, G, B, A, dst);
@@ -402,7 +402,7 @@ static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
 static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(B, G, R, A, dst);
@@ -430,7 +430,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
-  const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
   while (length >= 16) {
     CALC_RGB16(y, u, v, R, G, B);
     STORE16_4(A, R, G, B, dst);
@@ -459,11 +459,11 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
                               const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B, RG, BA, tmp0, tmp1;
   while (length >= 16) {
-  #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
-  #else
+#else
     CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
-  #endif
+#endif
     y      += 16;
     u      += 16;
     v      += 16;
@@ -473,7 +473,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   if (length > 8) {
     uint8_t temp[2 * 16] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
 #else
     CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
@@ -482,7 +482,7 @@ static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
   } else if (length > 0) {
     uint8_t temp[2 * 8] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
 #else
     CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
@@ -495,11 +495,11 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
                             const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B, RG, GB, tmp0, tmp1;
   while (length >= 16) {
-  #ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(y, u, v, GB, RG, 16, dst);
-  #else
+#else
     CALC_RGB565(y, u, v, RG, GB, 16, dst);
-  #endif
+#endif
     y      += 16;
     u      += 16;
     v      += 16;
@@ -509,7 +509,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
   if (length > 8) {
     uint8_t temp[2 * 16] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(temp, u, v, GB, RG, 16, temp);
 #else
     CALC_RGB565(temp, u, v, RG, GB, 16, temp);
@@ -518,7 +518,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
   } else if (length > 0) {
     uint8_t temp[2 * 8] = { 0 };
     memcpy(temp, y, length * sizeof(*temp));
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
     CALC_RGB565(temp, u, v, GB, RG, 8, temp);
 #else
     CALC_RGB565(temp, u, v, RG, GB, 8, temp);
@@ -640,13 +640,15 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
   }                                                                      \
 }
 
-UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
-UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
 UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
+#if !defined(WEBP_REDUCE_CSP)
+UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
 UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
+#endif   // WEBP_REDUCE_CSP
 
 //------------------------------------------------------------------------------
 // Entry point
@@ -656,17 +658,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersMSA(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
-  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
   WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
   WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
   WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
   WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
   WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
   WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
diff --git a/thirdparty/libwebp/dsp/upsampling_neon.c b/thirdparty/libwebp/src/dsp/upsampling_neon.c
index d371a834ff..17cbc9f911 100644
--- a/thirdparty/libwebp/dsp/upsampling_neon.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_neon.c
@@ -12,15 +12,15 @@
 // Author: mans@mansr.com (Mans Rullgard)
 // Based on SSE code by: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
 #include <arm_neon.h>
 #include <string.h>
-#include "./neon.h"
-#include "./yuv.h"
+#include "src/dsp/neon.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -58,8 +58,8 @@
 } while (0)
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
-                             uint8_t *out) {
+static void Upsample16Pixels_NEON(const uint8_t *r1, const uint8_t *r2,
+                                  uint8_t *out) {
   UPSAMPLE_16PIXELS(r1, r2, out);
 }
 
@@ -70,7 +70,7 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
   /* replicate last byte */                                             \
   memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
   memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
-  Upsample16Pixels(r1, r2, out);                                        \
+  Upsample16Pixels_NEON(r1, r2, out);                                   \
 }
 
 //-----------------------------------------------------------------------------
@@ -243,13 +243,15 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
 }
 
 // NEON variants of the fancy upsampler.
-NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair,  Rgb,  3)
-NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair,  Bgr,  3)
-NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
-NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
-NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
-NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
-NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
+NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
+#if !defined(WEBP_REDUCE_CSP)
+NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON,  Rgb,  3)
+NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON,  Bgr,  3)
+NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
+#endif   // WEBP_REDUCE_CSP
 
 //------------------------------------------------------------------------------
 // Entry point
@@ -259,17 +261,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_NEON;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_NEON;
+  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
+  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
+  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
diff --git a/thirdparty/libwebp/dsp/upsampling_sse2.c b/thirdparty/libwebp/src/dsp/upsampling_sse2.c
index b5b668900f..fd5d303982 100644
--- a/thirdparty/libwebp/dsp/upsampling_sse2.c
+++ b/thirdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -11,14 +11,14 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_SSE2)
 
 #include <assert.h>
 #include <emmintrin.h>
 #include <string.h>
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 #ifdef FANCY_UPSAMPLING
 
@@ -83,13 +83,13 @@
   GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                                \
   /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
-  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
+  PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
+  PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
 }
 
 // Turn the macro into a function for reducing code-size when non-critical
-static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
-                             uint8_t* const out) {
+static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
+                                  uint8_t* const out) {
   UPSAMPLE_32PIXELS(r1, r2, out);
 }
 
@@ -101,30 +101,30 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
   memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
   memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
   /* using the shared function instead of the macro saves ~3k code size */     \
-  Upsample32Pixels(r1, r2, out);                                               \
+  Upsample32Pixels_SSE2(r1, r2, out);                                          \
 }
 
 #define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
                     top_dst, bottom_dst, cur_x, num_pixels) {                  \
   int n;                                                                       \
   for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
-         top_dst + ((cur_x) + n) * XSTEP);                                     \
+    FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n],                                 \
+         (top_dst) + ((cur_x) + n) * (XSTEP));                                 \
   }                                                                            \
-  if (bottom_y != NULL) {                                                      \
+  if ((bottom_y) != NULL) {                                                    \
     for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
-           bottom_dst + ((cur_x) + n) * XSTEP);                                \
+      FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n],                  \
+           (bottom_dst) + ((cur_x) + n) * (XSTEP));                            \
     }                                                                          \
   }                                                                            \
 }
 
 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                        top_dst, bottom_dst, cur_x) do {                        \
-  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
-  if (bottom_y != NULL) {                                                      \
-    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
-             bottom_dst + (cur_x) * XSTEP);                                    \
+  FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));   \
+  if ((bottom_y) != NULL) {                                                    \
+    FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64,                    \
+                  (bottom_dst) + (cur_x) * (XSTEP));                           \
   }                                                                            \
 } while (0)
 
@@ -169,13 +169,16 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
 }
 
 // SSE2 variants of the fancy upsampler.
-SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
-SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
-SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
-SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
+
+#if !defined(WEBP_REDUCE_CSP)
+SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2,  VP8YuvToRgb,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2,  VP8YuvToBgr,  3)
+SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
+#endif   // WEBP_REDUCE_CSP
 
 #undef GET_M
 #undef PACK_AND_STORE
@@ -193,17 +196,19 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 extern void WebPInitUpsamplersSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
-  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair;
-  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair;
-  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
-  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
-  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
-  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
-  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
+  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
+  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
+  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_SSE2;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_SSE2;
+  WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
+  WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
+  WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #endif  // FANCY_UPSAMPLING
@@ -213,29 +218,46 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 extern void WebPInitYUV444ConvertersSSE2(void);
 
-#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
-extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u,             \
-                               const uint8_t* v, uint8_t* dst, int len);       \
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+                   uint8_t* dst, int len);                                     \
 static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
                       uint8_t* dst, int len) {                                 \
   int i;                                                                       \
   const int max_len = len & ~31;                                               \
-  for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
+  for (i = 0; i < max_len; i += 32) {                                          \
+    CALL(y + i, u + i, v + i, dst + i * (XSTEP));                              \
+  }                                                                            \
   if (i < len) {  /* C-fallback */                                             \
-    WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i);         \
+    CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i);                   \
   }                                                                            \
 }
 
-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
+YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
+#if !defined(WEBP_REDUCE_CSP)
+YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
+YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
+YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
+YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
+            WebPYuv444ToRgba4444_C, 2)
+YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
+#endif   // WEBP_REDUCE_CSP
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
-  WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
-  WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
-  WebPYUV444Converters[MODE_RGB]  = Yuv444ToRgb;
-  WebPYUV444Converters[MODE_BGR]  = Yuv444ToBgr;
+  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra_SSE2;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba_SSE2;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra_SSE2;
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE2;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE2;
+  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
+  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565_SSE2;
+  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb_SSE2;
+  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
+#endif   // WEBP_REDUCE_CSP
 }
 
 #else
diff --git a/thirdparty/libwebp/dsp/yuv.c b/thirdparty/libwebp/src/dsp/yuv.c
index dd7d9dedfa..bddf81fe09 100644
--- a/thirdparty/libwebp/dsp/yuv.c
+++ b/thirdparty/libwebp/src/dsp/yuv.c
@@ -11,63 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
+#include <assert.h>
 #include <stdlib.h>
 
-#if defined(WEBP_YUV_USE_TABLE)
-
-static int done = 0;
-
-static WEBP_INLINE uint8_t clip(int v, int max_value) {
-  return v < 0 ? 0 : v > max_value ? max_value : v;
-}
-
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
-  int i;
-  if (done) {
-    return;
-  }
-#ifndef USE_YUVj
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -45773 * (i - 128);
-    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
-    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
-    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
-  }
-#else
-  for (i = 0; i < 256; ++i) {
-    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
-    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
-    VP8kVToG[i] = -46802 * (i - 128);
-    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
-  }
-  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
-    const int k = i;
-    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
-    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
-  }
-#endif
-
-  done = 1;
-}
-
-#else
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
-
-#endif  // WEBP_YUV_USE_TABLE
-
 //-----------------------------------------------------------------------------
 // Plain-C version
 
@@ -75,14 +23,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
 static void FUNC_NAME(const uint8_t* y,                                        \
                       const uint8_t* u, const uint8_t* v,                      \
                       uint8_t* dst, int len) {                                 \
-  const uint8_t* const end = dst + (len & ~1) * XSTEP;                         \
+  const uint8_t* const end = dst + (len & ~1) * (XSTEP);                       \
   while (dst != end) {                                                         \
     FUNC(y[0], u[0], v[0], dst);                                               \
-    FUNC(y[1], u[0], v[0], dst + XSTEP);                                       \
+    FUNC(y[1], u[0], v[0], dst + (XSTEP));                                     \
     y += 2;                                                                    \
     ++u;                                                                       \
     ++v;                                                                       \
-    dst += 2 * XSTEP;                                                          \
+    dst += 2 * (XSTEP);                                                        \
   }                                                                            \
   if (len & 1) {                                                               \
     FUNC(y[0], u[0], v[0], dst);                                               \
@@ -168,7 +116,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
 //-----------------------------------------------------------------------------
 // ARGB -> YUV converters
 
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const uint32_t p = argb[i];
@@ -220,14 +168,14 @@ void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
 
 //-----------------------------------------------------------------------------
 
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i, rgb += 3) {
     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
   }
 }
 
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
   int i;
   for (i = 0; i < width; ++i, bgr += 3) {
     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
@@ -246,6 +194,7 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
 
 //-----------------------------------------------------------------------------
 
+#if !WEBP_NEON_OMIT_C_CODE
 #define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
 static uint16_t clip_y(int v) {
   return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
@@ -283,6 +232,7 @@ static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
     out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
   }
 }
+#endif  // !WEBP_NEON_OMIT_C_CODE
 
 #undef MAX_Y
 
@@ -308,22 +258,26 @@ static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
     (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
 
 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
+extern void WebPInitSharpYUVNEON(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
   if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
 
-  WebPConvertARGBToY = ConvertARGBToY;
+  WebPConvertARGBToY = ConvertARGBToY_C;
   WebPConvertARGBToUV = WebPConvertARGBToUV_C;
 
-  WebPConvertRGB24ToY = ConvertRGB24ToY;
-  WebPConvertBGR24ToY = ConvertBGR24ToY;
+  WebPConvertRGB24ToY = ConvertRGB24ToY_C;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_C;
 
   WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
 
+#if !WEBP_NEON_OMIT_C_CODE
   WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
   WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
   WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+#endif
 
   if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
@@ -333,5 +287,23 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
     }
 #endif  // WEBP_USE_SSE2
   }
+
+#if defined(WEBP_USE_NEON)
+  if (WEBP_NEON_OMIT_C_CODE ||
+      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
+    WebPInitConvertARGBToYUVNEON();
+    WebPInitSharpYUVNEON();
+  }
+#endif  // WEBP_USE_NEON
+
+  assert(WebPConvertARGBToY != NULL);
+  assert(WebPConvertARGBToUV != NULL);
+  assert(WebPConvertRGB24ToY != NULL);
+  assert(WebPConvertBGR24ToY != NULL);
+  assert(WebPConvertRGBA32ToUV != NULL);
+  assert(WebPSharpYUVUpdateY != NULL);
+  assert(WebPSharpYUVUpdateRGB != NULL);
+  assert(WebPSharpYUVFilterRow != NULL);
+
   rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/thirdparty/libwebp/dsp/yuv.h b/thirdparty/libwebp/src/dsp/yuv.h
index 1d33b5863b..c8a55832d4 100644
--- a/thirdparty/libwebp/dsp/yuv.h
+++ b/thirdparty/libwebp/src/dsp/yuv.h
@@ -35,18 +35,8 @@
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 
-#include "./dsp.h"
-#include "../dec/vp8_dec.h"
-
-#if defined(WEBP_EXPERIMENTAL_FEATURES)
-// Do NOT activate this feature for real compression. This is only experimental!
-// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
-// This colorspace is close to Rec.601's Y'CbCr model with the notable
-// difference of allowing larger range for luma/chroma.
-// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
-// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
-// #define USE_YUVj
-#endif
+#include "src/dsp/dsp.h"
+#include "src/dec/vp8_dec.h"
 
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
@@ -58,12 +48,8 @@ extern "C" {
 enum {
   YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
   YUV_HALF = 1 << (YUV_FIX - 1),
-  YUV_MASK = (256 << YUV_FIX) - 1,
-  YUV_RANGE_MIN = -227,            // min value of r/g/b output
-  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
 
   YUV_FIX2 = 6,                   // fixed-point precision for YUV->RGB
-  YUV_HALF2 = 1 << YUV_FIX2 >> 1,
   YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
 
@@ -111,7 +97,7 @@ static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
   const int b = VP8YUVToB(y, u);      // 5 usable bits
   const int rg = (r & 0xf8) | (g >> 5);
   const int gb = ((g << 3) & 0xe0) | (b >> 3);
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   rgb[0] = gb;
   rgb[1] = rg;
 #else
@@ -127,7 +113,7 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
   const int b = VP8YUVToB(y, u);        // 4 usable bits
   const int rg = (r & 0xf0) | (g >> 4);
   const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
-#ifdef WEBP_SWAP_16BIT_CSP
+#if (WEBP_SWAP_16BIT_CSP == 1)
   argb[0] = ba;
   argb[1] = rg;
 #else
@@ -157,29 +143,26 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
   rgba[3] = 0xff;
 }
 
-// Must be called before everything, to initialize the tables.
-void VP8YUVInit(void);
-
 //-----------------------------------------------------------------------------
 // SSE2 extra functions (mostly for upsampling_sse2.c)
 
 #if defined(WEBP_USE_SSE2)
 
 // Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst);
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst);
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst);
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                         uint8_t* dst);
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst);
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst);
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst);
 
 #endif    // WEBP_USE_SSE2
 
@@ -192,8 +175,6 @@ static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
   return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
 }
 
-#ifndef USE_YUVj
-
 static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
   const int luma = 16839 * r + 33059 * g + 6420 * b;
   return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
@@ -209,28 +190,6 @@ static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
   return VP8ClipUV(v, rounding);
 }
 
-#else
-
-// This JPEG-YUV colorspace, only for comparison!
-// These are also 16bit precision coefficients from Rec.601, but with full
-// [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
-  const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + rounding) >> YUV_FIX;  // no need to clip
-}
-
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
-  const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u, rounding);
-}
-
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
-  const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v, rounding);
-}
-
-#endif    // USE_YUVj
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/thirdparty/libwebp/dsp/yuv_mips32.c b/thirdparty/libwebp/src/dsp/yuv_mips32.c
index e61aac571f..9d0a887824 100644
--- a/thirdparty/libwebp/dsp/yuv_mips32.c
+++ b/thirdparty/libwebp/src/dsp/yuv_mips32.c
@@ -12,11 +12,11 @@
 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS32)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 //------------------------------------------------------------------------------
 // simple point-sampling
@@ -77,10 +77,10 @@ static void FUNC_NAME(const uint8_t* y,                                        \
   }                                                                            \
 }
 
-ROW_FUNC(YuvToRgbRow,      3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow,     4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow,      3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPS32,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPS32,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPS32,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPS32,     4, 2, 1, 0, 3)
 
 #undef ROW_FUNC
 
@@ -90,10 +90,10 @@ ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
 extern void WebPInitSamplersMIPS32(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPS32;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPS32;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
 }
 
 #else  // !WEBP_USE_MIPS32
diff --git a/thirdparty/libwebp/dsp/yuv_mips_dsp_r2.c b/thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
index 1720d4190f..cc8afcc756 100644
--- a/thirdparty/libwebp/dsp/yuv_mips_dsp_r2.c
+++ b/thirdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
@@ -12,11 +12,11 @@
 // Author(s):  Branimir Vasic (branimir.vasic@imgtec.com)
 //             Djordje Pesut  (djordje.pesut@imgtec.com)
 
-#include "./dsp.h"
+#include "src/dsp/dsp.h"
 
 #if defined(WEBP_USE_MIPS_DSP_R2)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 //------------------------------------------------------------------------------
 // simple point-sampling
@@ -105,10 +105,10 @@ static void FUNC_NAME(const uint8_t* y,                                        \
   }                                                                            \
 }
 
-ROW_FUNC(YuvToRgbRow,      3, 0, 1, 2, 0)
-ROW_FUNC(YuvToRgbaRow,     4, 0, 1, 2, 3)
-ROW_FUNC(YuvToBgrRow,      3, 2, 1, 0, 0)
-ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
+ROW_FUNC(YuvToRgbRow_MIPSdspR2,      3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow_MIPSdspR2,     4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow_MIPSdspR2,      3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow_MIPSdspR2,     4, 2, 1, 0, 3)
 
 #undef ROW_FUNC
 #undef ASM_CLOBBER_LIST
@@ -121,10 +121,10 @@ ROW_FUNC(YuvToBgraRow,     4, 2, 1, 0, 3)
 extern void WebPInitSamplersMIPSdspR2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_MIPSdspR2;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_MIPSdspR2;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
 }
 
 #else  // !WEBP_USE_MIPS_DSP_R2
diff --git a/thirdparty/libwebp/src/dsp/yuv_neon.c b/thirdparty/libwebp/src/dsp/yuv_neon.c
new file mode 100644
index 0000000000..a34d60248f
--- /dev/null
+++ b/thirdparty/libwebp/src/dsp/yuv_neon.c
@@ -0,0 +1,288 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/yuv.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "src/dsp/neon.h"
+
+//-----------------------------------------------------------------------------
+// Luma of 8 pixels, with the same coefficients and +16 bias as VP8RGBToY().
+static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
+                                    const uint8x8_t G,
+                                    const uint8x8_t B) {
+  const uint16x8_t r = vmovl_u8(R);  // widen each channel to 16 bits
+  const uint16x8_t g = vmovl_u8(G);
+  const uint16x8_t b = vmovl_u8(B);
+  const uint16x4_t r_lo = vget_low_u16(r);  // 4-lane halves: products are 32b
+  const uint16x4_t r_hi = vget_high_u16(r);
+  const uint16x4_t g_lo = vget_low_u16(g);
+  const uint16x4_t g_hi = vget_high_u16(g);
+  const uint16x4_t b_lo = vget_low_u16(b);
+  const uint16x4_t b_hi = vget_high_u16(b);
+  const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);  // 16839 * r
+  const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
+  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);  // + 33059 * g
+  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
+  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);  // + 6420 * b
+  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
+  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
+                                     vrshrn_n_u32(tmp2_hi, 16));  // rounded >> 16
+  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));  // + 16 bias
+  return vqmovn_u16(Y2);  // saturating narrow to 8 bits
+}
+
+static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
+  int i;  // writes one luma byte to y[] per packed 3-byte RGB pixel
+  for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {  // 8 pixels per pass
+    const uint8x8x3_t RGB = vld3_u8(rgb);  // de-interleave: val[0..2] = R, G, B
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over (< 8 pixels): scalar path
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
+  int i;  // same as the RGB24 variant, with the R and B channels swapped
+  for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {  // 8 pixels per pass
+    const uint8x8x3_t BGR = vld3_u8(bgr);  // de-interleave: val[0..2] = B, G, R
+    const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i, bgr += 3) {  // left-over (< 8 pixels): scalar path
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
+  int i;  // NOTE(review): val[2]=R, val[0]=B below assumes little-endian words
+  for (i = 0; i + 8 <= width; i += 8) {  // 8 pixels per pass
+    const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
+    const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
+    vst1_u8(y + i, Y);
+  }
+  for (; i < width; ++i) {   // left-over (< 8 pixels): scalar path
+    const uint32_t p = argb[i];  // R = bits 16..23, G = bits 8..15, B = bits 0..7
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
+                     YUV_HALF);
+  }
+}
+
+//-----------------------------------------------------------------------------
+
+// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
+#define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
+  const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
+  const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
+  const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
+  const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
+  const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
+  const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
+// MULTIPLY_16b() reads the {r,g,b}_{lo,hi} locals declared by the preamble.
+#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
+  const int32x4_t tmp0_lo = vmull_n_s16(         r_lo, C0);      \
+  const int32x4_t tmp0_hi = vmull_n_s16(         r_hi, C0);      \
+  const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
+  const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
+  const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
+  const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
+  const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
+                                      vshrn_n_s32(tmp2_hi, 16)); \
+  DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
+} while (0)
+// U_DST/V_DST are biased by (128 << SHIFT); both callers then shift by SHIFT.
+// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
+#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
+  MULTIPLY_16b_PREAMBLE(r, g, b);                                \
+  MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
+  MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
+} while (0)
+
+static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
+                                   uint8_t* u, uint8_t* v, int width) {
+  int i;  // each input entry is four 16-bit components; only three are used
+  for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {  // 8 u/v pairs per pass
+    const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
+    int16x8_t U, V;
+    CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
+    vst1_u8(u + i, vqrshrun_n_s16(U, 2));  // rounded >> 2, saturate to 8 bits
+    vst1_u8(v + i, vqrshrun_n_s16(V, 2));
+  }
+  for (; i < width; i += 1, rgb += 4) {  // left-over: scalar path
+    const int r = rgb[0], g = rgb[1], b = rgb[2];  // rgb[3] is not read
+    u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+    v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+  }
+}
+
+static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {
+  int i;  // one u/v sample is produced per horizontal pair of ARGB pixels
+  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {  // 16 px -> 8 uv
+    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
+    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
+    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
+    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
+    int16x8_t U_tmp, V_tmp;
+    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
+    {
+      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);  // rounded >> 1, saturate
+      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
+      if (do_store) {  // overwrite the destination samples
+        vst1_u8(u, U);
+        vst1_u8(v, V);
+      } else {  // rounding average with the samples already in u[] / v[]
+        const uint8x8_t prev_u = vld1_u8(u);
+        const uint8x8_t prev_v = vld1_u8(v);
+        vst1_u8(u, vrhadd_u8(U, prev_u));
+        vst1_u8(v, vrhadd_u8(V, prev_v));
+      }
+    }
+  }
+  if (i < src_width) {  // left-over: handled by the plain-C implementation
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+
+//------------------------------------------------------------------------------
+// Entry point: installs the NEON RGB -> YUV converters defined in this file.
+extern void WebPInitConvertARGBToYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
+  WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
+  WebPConvertARGBToY = ConvertARGBToY_NEON;
+  WebPConvertARGBToUV = ConvertARGBToUV_NEON;
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
+}
+
+//------------------------------------------------------------------------------
+// SharpYUV helpers. clip_y_NEON() clamps a value to the [0, MAX_Y] range.
+#define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
+static uint16_t clip_y_NEON(int v) {
+  return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
+                                     uint16_t* dst, int len) {
+  int i;  // dst[i] += ref[i] - src[i], clamped; returns the sum of |ref - src|
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  uint64x2_t sum = vdupq_n_u64(0);  // running sum of abs(diff_y), two lanes
+  uint64_t diff;
+
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per pass
+    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
+    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
+    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
+    const int16x8_t D = vsubq_s16(A, B);       // diff_y
+    const int16x8_t F = vaddq_s16(C, D);       // new_y
+    const uint16x8_t H =
+        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));  // clamp
+    const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
+    vst1q_u16(dst + i, H);
+    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));  // widen+add
+  }
+  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);  // fold both lanes
+  for (; i < len; ++i) {  // left-over: scalar path
+    const int diff_y = ref[i] - src[i];
+    const int new_y = (int)(dst[i]) + diff_y;
+    dst[i] = clip_y_NEON(new_y);
+    diff += (uint64_t)(abs(diff_y));
+  }
+  return diff;
+}
+
+static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
+                                   int16_t* dst, int len) {
+  int i;  // dst[i] += ref[i] - src[i]
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 samples per pass
+    const int16x8_t A = vld1q_s16(ref + i);
+    const int16x8_t B = vld1q_s16(src + i);
+    const int16x8_t C = vld1q_s16(dst + i);
+    const int16x8_t D = vsubq_s16(A, B);   // diff_uv
+    const int16x8_t E = vaddq_s16(C, D);   // new_uv
+    vst1q_s16(dst + i, E);  // stored as-is: no clamping in this function
+  }
+  for (; i < len; ++i) {  // left-over: scalar path
+    const int diff_uv = ref[i] - src[i];
+    dst[i] += diff_uv;
+  }
+}
+
+static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
+                                   const uint16_t* best_y, uint16_t* out) {
+  int i;  // writes 2 * len samples to out[]; reads A[] and B[] up to index len
+  const int16x8_t max = vdupq_n_s16(MAX_Y);
+  const int16x8_t zero = vdupq_n_s16(0);
+  for (i = 0; i + 8 <= len; i += 8) {  // 8 inputs -> 16 outputs per pass
+    const int16x8_t a0 = vld1q_s16(A + i + 0);
+    const int16x8_t a1 = vld1q_s16(A + i + 1);  // A[i + 1 .. i + 8]
+    const int16x8_t b0 = vld1q_s16(B + i + 0);
+    const int16x8_t b1 = vld1q_s16(B + i + 1);  // B[i + 1 .. i + 8]
+    const int16x8_t a0b1 = vaddq_s16(a0, b1);
+    const int16x8_t a1b0 = vaddq_s16(a1, b0);
+    const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
+    const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
+    const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
+    const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
+    const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
+    const int16x8_t d0 = vaddq_s16(c1, a0);
+    const int16x8_t d1 = vaddq_s16(c0, a1);
+    const int16x8_t e0 = vrshrq_n_s16(d0, 1);  // rounded >> 1
+    const int16x8_t e1 = vrshrq_n_s16(d1, 1);
+    const int16x8x2_t f = vzipq_s16(e0, e1);  // interleave: e0[0], e1[0], ...
+    const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
+    const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
+    const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
+    const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
+    const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);  // clamp
+    const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
+    vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
+    vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
+  }
+  for (; i < len; ++i) {  // left-over: scalar path
+    const int a0b1 = A[i + 0] + B[i + 1];
+    const int a1b0 = A[i + 1] + B[i + 0];
+    const int a0a1b0b1 = a0b1 + a1b0 + 8;  // the + 8 rounds the >> 4 below
+    const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+    const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+    out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
+    out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
+  }
+}
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
+  WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;  // install the NEON SharpYUV set
+  WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
+  WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
+}
+
+#else  // !WEBP_USE_NEON
+// Without NEON, the init entry points are defined through WEBP_DSP_INIT_STUB.
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
+
+#endif  // WEBP_USE_NEON
diff --git a/thirdparty/libwebp/dsp/yuv_sse2.c b/thirdparty/libwebp/src/dsp/yuv_sse2.c
index e33c2bbafd..6810bf8d15 100644
--- a/thirdparty/libwebp/dsp/yuv_sse2.c
+++ b/thirdparty/libwebp/src/dsp/yuv_sse2.c
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./yuv.h"
+#include "src/dsp/yuv.h"
 
 #if defined(WEBP_USE_SSE2)
 
-#include "./common_sse2.h"
+#include "src/dsp/common_sse2.h"
 #include <stdlib.h>
 #include <emmintrin.h>
 
@@ -26,12 +26,12 @@
 // R = (19077 * y             + 26149 * v - 14234) >> 6
 // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
 // B = (19077 * y + 33050 * u             - 17685) >> 6
-static void ConvertYUV444ToRGB(const __m128i* const Y0,
-                               const __m128i* const U0,
-                               const __m128i* const V0,
-                               __m128i* const R,
-                               __m128i* const G,
-                               __m128i* const B) {
+static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
+                                    const __m128i* const U0,
+                                    const __m128i* const V0,
+                                    __m128i* const R,
+                                    __m128i* const G,
+                                    __m128i* const B) {
   const __m128i k19077 = _mm_set1_epi16(19077);
   const __m128i k26149 = _mm_set1_epi16(26149);
   const __m128i k14234 = _mm_set1_epi16(14234);
@@ -66,13 +66,13 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0,
 }
 
 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
-static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
   return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
 }
 
 // Load and replicate the U/V samples
-static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
   const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
@@ -80,29 +80,33 @@ static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
 }
 
 // Convert 32 samples of YUV444 to R/G/B
-static void YUV444ToRGB(const uint8_t* const y,
-                        const uint8_t* const u,
-                        const uint8_t* const v,
-                        __m128i* const R, __m128i* const G, __m128i* const B) {
-  const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
-  ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV444ToRGB_SSE2(const uint8_t* const y,
+                             const uint8_t* const u,
+                             const uint8_t* const v,
+                             __m128i* const R, __m128i* const G,
+                             __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
+                V0 = Load_HI_16_SSE2(v);
+  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
 }
 
 // Convert 32 samples of YUV420 to R/G/B
-static void YUV420ToRGB(const uint8_t* const y,
-                        const uint8_t* const u,
-                        const uint8_t* const v,
-                        __m128i* const R, __m128i* const G, __m128i* const B) {
-  const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
-  ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
+static void YUV420ToRGB_SSE2(const uint8_t* const y,
+                             const uint8_t* const u,
+                             const uint8_t* const v,
+                             __m128i* const R, __m128i* const G,
+                             __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
+                V0 = Load_UV_HI_8_SSE2(v);
+  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
 }
 
 // Pack R/G/B/A results into 32b output.
-static WEBP_INLINE void PackAndStore4(const __m128i* const R,
-                                      const __m128i* const G,
-                                      const __m128i* const B,
-                                      const __m128i* const A,
-                                      uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
+                                           const __m128i* const G,
+                                           const __m128i* const B,
+                                           const __m128i* const A,
+                                           uint8_t* const dst) {
   const __m128i rb = _mm_packus_epi16(*R, *B);
   const __m128i ga = _mm_packus_epi16(*G, *A);
   const __m128i rg = _mm_unpacklo_epi8(rb, ga);
@@ -114,12 +118,12 @@ static WEBP_INLINE void PackAndStore4(const __m128i* const R,
 }
 
 // Pack R/G/B/A results into 16b output.
-static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
-                                         const __m128i* const G,
-                                         const __m128i* const B,
-                                         const __m128i* const A,
-                                         uint8_t* const dst) {
-#if !defined(WEBP_SWAP_16BIT_CSP)
+static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
+                                              const __m128i* const G,
+                                              const __m128i* const B,
+                                              const __m128i* const A,
+                                              uint8_t* const dst) {
+#if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rg0 = _mm_packus_epi16(*R, *G);
   const __m128i ba0 = _mm_packus_epi16(*B, *A);
 #else
@@ -136,10 +140,10 @@ static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
 }
 
 // Pack R/G/B results into 16b output.
-static WEBP_INLINE void PackAndStore565(const __m128i* const R,
-                                        const __m128i* const G,
-                                        const __m128i* const B,
-                                        uint8_t* const dst) {
+static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
+                                             const __m128i* const G,
+                                             const __m128i* const B,
+                                             uint8_t* const dst) {
   const __m128i r0 = _mm_packus_epi16(*R, *R);
   const __m128i g0 = _mm_packus_epi16(*G, *G);
   const __m128i b0 = _mm_packus_epi16(*B, *B);
@@ -149,7 +153,7 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
   const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
   const __m128i rg = _mm_or_si128(r1, g1);
   const __m128i gb = _mm_or_si128(g2, b1);
-#if !defined(WEBP_SWAP_16BIT_CSP)
+#if (WEBP_SWAP_16BIT_CSP == 0)
   const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
 #else
   const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
@@ -160,10 +164,10 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
-                                    __m128i* const in2, __m128i* const in3,
-                                    __m128i* const in4, __m128i* const in5,
-                                    uint8_t* const rgb) {
+static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
+                                         __m128i* const in2, __m128i* const in3,
+                                         __m128i* const in4, __m128i* const in5,
+                                         uint8_t* const rgb) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -186,69 +190,69 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
   _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
 }
 
-void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&R, &G, &B, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
   }
 }
 
-void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&B, &G, &R, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
   }
 }
 
-void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                    uint8_t* dst) {
+void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4(&kAlpha, &R, &G, &B, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
   }
 }
 
-void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst) {
+void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
+                             const uint8_t* v, uint8_t* dst) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore4444(&R, &G, &B, &kAlpha, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
   }
 }
 
-void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                      uint8_t* dst) {
+void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                           uint8_t* dst) {
   int n;
   for (n = 0; n < 32; n += 8, dst += 16) {
     __m128i R, G, B;
-    YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
-    PackAndStore565(&R, &G, &B, dst);
+    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
+    PackAndStore565_SSE2(&R, &G, &B, dst);
   }
 }
 
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
-  YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
-  YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
-  YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
-  YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+  YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as RRRRGGGGBBBB.
   rgb0 = _mm_packus_epi16(R0, R1);
@@ -259,18 +263,18 @@ void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   rgb5 = _mm_packus_epi16(B2, B3);
 
   // Pack as RGBRGBRGBRGB.
-  PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 }
 
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                   uint8_t* dst) {
+void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                        uint8_t* dst) {
   __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
   __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
-  YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-  YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
-  YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
-  YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
+  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
 
   // Cast to 8b and store as BBBBGGGGRRRR.
   bgr0 = _mm_packus_epi16(B0, B1);
@@ -281,20 +285,21 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   bgr5= _mm_packus_epi16(R2, R3);
 
   // Pack as BGRBGRBGRBGR.
-  PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 }
 
 //-----------------------------------------------------------------------------
 // Arbitrary-length row conversion functions
 
-static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToRgbaRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&R, &G, &B, &kAlpha, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -308,14 +313,15 @@ static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToBgraRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&B, &G, &R, &kAlpha, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -329,14 +335,15 @@ static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                         uint8_t* dst, int len) {
+static void YuvToArgbRow_SSE2(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
   const __m128i kAlpha = _mm_set1_epi16(255);
   int n;
   for (n = 0; n + 8 <= len; n += 8, dst += 32) {
     __m128i R, G, B;
-    YUV420ToRGB(y, u, v, &R, &G, &B);
-    PackAndStore4(&kAlpha, &R, &G, &B, dst);
+    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
+    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
     y += 8;
     u += 4;
     v += 4;
@@ -350,17 +357,18 @@ static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst, int len) {
+static void YuvToRgbRow_SSE2(const uint8_t* y,
+                             const uint8_t* u, const uint8_t* v,
+                             uint8_t* dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
 
-    YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-    YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
-    YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
-    YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as RRRRGGGGBBBB.
     rgb0 = _mm_packus_epi16(R0, R1);
@@ -371,7 +379,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     rgb5 = _mm_packus_epi16(B2, B3);
 
     // Pack as RGBRGBRGBRGB.
-    PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
 
     y += 32;
     u += 16;
@@ -386,17 +394,18 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
   }
 }
 
-static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
-                        uint8_t* dst, int len) {
+static void YuvToBgrRow_SSE2(const uint8_t* y,
+                             const uint8_t* u, const uint8_t* v,
+                             uint8_t* dst, int len) {
   int n;
   for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
     __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
     __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
 
-    YUV420ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
-    YUV420ToRGB(y +  8, u +  4, v +  4, &R1, &G1, &B1);
-    YUV420ToRGB(y + 16, u +  8, v +  8, &R2, &G2, &B2);
-    YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
 
     // Cast to 8b and store as BBBBGGGGRRRR.
     bgr0 = _mm_packus_epi16(B0, B1);
@@ -407,7 +416,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
     bgr5 = _mm_packus_epi16(R2, R3);
 
     // Pack as BGRBGRBGRBGR.
-    PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
 
     y += 32;
     u += 16;
@@ -428,11 +437,11 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 extern void WebPInitSamplersSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
-  WebPSamplers[MODE_RGB]  = YuvToRgbRow;
-  WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
-  WebPSamplers[MODE_BGR]  = YuvToBgrRow;
-  WebPSamplers[MODE_BGRA] = YuvToBgraRow;
-  WebPSamplers[MODE_ARGB] = YuvToArgbRow;
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE2;
+  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE2;
+  WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
+  WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
 }
 
 //------------------------------------------------------------------------------
@@ -445,7 +454,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
 
 // Function that inserts a value of the second half of the in buffer in between
 // every two char of the first half.
-static WEBP_INLINE void RGB24PackedToPlanarHelper(
+static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
     const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
   out[0] = _mm_unpacklo_epi8(in[0], in[3]);
   out[1] = _mm_unpackhi_epi8(in[0], in[3]);
@@ -458,8 +467,8 @@ static WEBP_INLINE void RGB24PackedToPlanarHelper(
 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // Similar to PlanarTo24bHelper(), but in reverse order.
-static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
-                                            __m128i* const out /*out[6]*/) {
+static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
+    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
   __m128i tmp[6];
   tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
   tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
@@ -468,16 +477,16 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
   tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
   tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
 
-  RGB24PackedToPlanarHelper(tmp, out);
-  RGB24PackedToPlanarHelper(out, tmp);
-  RGB24PackedToPlanarHelper(tmp, out);
-  RGB24PackedToPlanarHelper(out, tmp);
-  RGB24PackedToPlanarHelper(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(out, tmp);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
+  RGB24PackedToPlanarHelper_SSE2(out, tmp);
+  RGB24PackedToPlanarHelper_SSE2(tmp, out);
 }
 
 // Convert 8 packed ARGB to r[], g[], b[]
-static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
-                                            __m128i* const rgb /*in[6]*/) {
+static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
+                                                 __m128i* const rgb /*in[6]*/) {
   const __m128i zero = _mm_setzero_si128();
   __m128i a0 = LOAD_16(argb + 0);
   __m128i a1 = LOAD_16(argb + 4);
@@ -511,10 +520,10 @@ static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
 } while (0)
 
 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
-static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
-                                      const __m128i* const G,
-                                      const __m128i* const B,
-                                      __m128i* const Y) {
+static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
+                                           const __m128i* const G,
+                                           const __m128i* const B,
+                                           __m128i* const Y) {
   const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
   const __m128i kGB_y = MK_CST_16(16384, 6420);
   const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
@@ -526,10 +535,11 @@ static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
   TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
 }
 
-static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
-                                       const __m128i* const G,
-                                       const __m128i* const B,
-                                       __m128i* const U, __m128i* const V) {
+static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
+                                            const __m128i* const G,
+                                            const __m128i* const B,
+                                            __m128i* const U,
+                                            __m128i* const V) {
   const __m128i kRG_u = MK_CST_16(-9719, -19081);
   const __m128i kGB_u = MK_CST_16(0, 28800);
   const __m128i kRG_v = MK_CST_16(28800, 0);
@@ -549,14 +559,14 @@ static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
 #undef MK_CST_16
 #undef TRANSFORM
 
-static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
     __m128i rgb_plane[6];
     int j;
 
-    RGB24PackedToPlanar(rgb, rgb_plane);
+    RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
 
     for (j = 0; j < 2; ++j, i += 16) {
       const __m128i zero = _mm_setzero_si128();
@@ -566,13 +576,13 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
       r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
       g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
       b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
 
       // Convert to 16-bit Y.
       r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
       g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
       b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
 
       // Cast to 8-bit and store.
       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -583,14 +593,14 @@ static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
   }
 }
 
-static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
   const int max_width = width & ~31;
   int i;
   for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
     __m128i bgr_plane[6];
     int j;
 
-    RGB24PackedToPlanar(bgr, bgr_plane);
+    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
 
     for (j = 0; j < 2; ++j, i += 16) {
       const __m128i zero = _mm_setzero_si128();
@@ -600,13 +610,13 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
       b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
       g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
       r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y0);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
 
       // Convert to 16-bit Y.
       b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
       g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
       r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
-      ConvertRGBToY(&r, &g, &b, &Y1);
+      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
 
       // Cast to 8-bit and store.
       STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
@@ -617,14 +627,14 @@ static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
   }
 }
 
-static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
   const int max_width = width & ~15;
   int i;
   for (i = 0; i < max_width; i += 16) {
     __m128i Y0, Y1, rgb[6];
-    RGB32PackedToPlanar(&argb[i], rgb);
-    ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
-    ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
+    ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
     STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
   }
   for (; i < width; ++i) {   // left-over
@@ -636,31 +646,33 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
 
 // Horizontal add (doubled) of two 16b values, result is 16b.
 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
-static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
-                              __m128i* const out) {
+static void HorizontalAddPack_SSE2(const __m128i* const A,
+                                   const __m128i* const B,
+                                   __m128i* const out) {
   const __m128i k2 = _mm_set1_epi16(2);
   const __m128i C = _mm_madd_epi16(*A, k2);
   const __m128i D = _mm_madd_epi16(*B, k2);
   *out = _mm_packs_epi32(C, D);
 }
 
-static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
-                            int src_width, int do_store) {
+static void ConvertARGBToUV_SSE2(const uint32_t* argb,
+                                 uint8_t* u, uint8_t* v,
+                                 int src_width, int do_store) {
   const int max_width = src_width & ~31;
   int i;
   for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
     __m128i rgb[6], U0, V0, U1, V1;
-    RGB32PackedToPlanar(&argb[i], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
-
-    RGB32PackedToPlanar(&argb[i + 16], rgb);
-    HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
-    HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
-    HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
-    ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
+    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
 
     U0 = _mm_packus_epi16(U0, U1);
     V0 = _mm_packus_epi16(V0, V1);
@@ -679,10 +691,9 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
 }
 
 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
-static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
-                                                 __m128i* const r,
-                                                 __m128i* const g,
-                                                 __m128i* const b) {
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
   const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
   const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
   const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
@@ -701,16 +712,16 @@ static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
   *b = _mm_unpacklo_epi64(B1, B3);
 }
 
-static void ConvertRGBA32ToUV(const uint16_t* rgb,
-                              uint8_t* u, uint8_t* v, int width) {
+static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
+                                   uint8_t* u, uint8_t* v, int width) {
   const int max_width = width & ~15;
   const uint16_t* const last_rgb = rgb + 4 * max_width;
   while (rgb < last_rgb) {
     __m128i r, g, b, U0, V0, U1, V1;
-    RGBA32PackedToPlanar_16b(rgb +  0, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U0, &V0);
-    RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
-    ConvertRGBToUV(&r, &g, &b, &U1, &V1);
+    RGBA32PackedToPlanar_16b_SSE2(rgb +  0, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
+    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
     STORE_16(_mm_packus_epi16(U0, U1), u);
     STORE_16(_mm_packus_epi16(V0, V1), v);
     u += 16;
@@ -727,13 +738,13 @@ static void ConvertRGBA32ToUV(const uint16_t* rgb,
 extern void WebPInitConvertARGBToYUVSSE2(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
-  WebPConvertARGBToY = ConvertARGBToY;
-  WebPConvertARGBToUV = ConvertARGBToUV;
+  WebPConvertARGBToY = ConvertARGBToY_SSE2;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
 
-  WebPConvertRGB24ToY = ConvertRGB24ToY;
-  WebPConvertBGR24ToY = ConvertBGR24ToY;
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
 
-  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
 }
 
 //------------------------------------------------------------------------------
diff --git a/thirdparty/libwebp/enc/alpha_enc.c b/thirdparty/libwebp/src/enc/alpha_enc.c
index 5a2c931f92..7e8d87f22e 100644
--- a/thirdparty/libwebp/enc/alpha_enc.c
+++ b/thirdparty/libwebp/src/enc/alpha_enc.c
@@ -14,12 +14,12 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../utils/filters_utils.h"
-#include "../utils/quant_levels_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/filters_utils.h"
+#include "src/utils/quant_levels_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
@@ -44,11 +44,11 @@
 //           invalid quality or method, or
 //           memory allocation for the compressed data fails.
 
-#include "../enc/vp8li_enc.h"
+#include "src/enc/vp8li_enc.h"
 
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
-                          VP8LBitWriter* const bw,
+                          int use_quality_100, VP8LBitWriter* const bw,
                           WebPAuxStats* const stats) {
   int ok = 0;
   WebPConfig config;
@@ -76,7 +76,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   // Set a low default quality for encoding alpha. Ensure that Alpha quality at
   // lower methods (3 and below) is less than the threshold for triggering
   // costly 'BackwardReferencesTraceBackwards'.
-  config.quality = 8.f * effort_level;
+  // If the alpha quality is set to 100 and the method to 6, allow for a high
+  // lossless quality to trigger the cruncher.
+  config.quality =
+      (use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
   // TODO(urvang): Temporary fix to avoid generating images that trigger
@@ -134,7 +137,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   if (method != ALPHA_NO_COMPRESSION) {
     ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
     ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
-                              &tmp_bw, &result->stats);
+                              !reduce_levels, &tmp_bw, &result->stats);
     if (ok) {
       output = VP8LBitWriterFinish(&tmp_bw);
       output_size = VP8LBitWriterNumBytes(&tmp_bw);
@@ -264,6 +267,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
                              reduce_levels, effort_level, NULL, &best);
   }
   if (ok) {
+#if !defined(WEBP_DISABLE_STATS)
     if (stats != NULL) {
       stats->lossless_features = best.stats.lossless_features;
       stats->histogram_bits = best.stats.histogram_bits;
@@ -274,6 +278,9 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
       stats->lossless_hdr_size = best.stats.lossless_hdr_size;
       stats->lossless_data_size = best.stats.lossless_data_size;
     }
+#else
+    (void)stats;
+#endif
     *output_size = VP8BitWriterSize(&best.bw);
     *output = VP8BitWriterBuf(&best.bw);
   } else {
@@ -339,10 +346,12 @@ static int EncodeAlpha(VP8Encoder* const enc,
     ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
                                filter, reduce_levels, effort_level, output,
                                output_size, pic->stats);
+#if !defined(WEBP_DISABLE_STATS)
     if (pic->stats != NULL) {  // need stats?
       pic->stats->coded_size += (int)(*output_size);
       enc->sse_[3] = sse;
     }
+#endif
   }
 
   WebPSafeFree(quant_alpha);
diff --git a/thirdparty/libwebp/enc/analysis_enc.c b/thirdparty/libwebp/src/enc/analysis_enc.c
index dce159b316..08f471f5f8 100644
--- a/thirdparty/libwebp/enc/analysis_enc.c
+++ b/thirdparty/libwebp/src/enc/analysis_enc.c
@@ -15,9 +15,9 @@
 #include <string.h>
 #include <assert.h>
 
-#include "./vp8i_enc.h"
-#include "./cost_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
+#include "src/utils/utils.h"
 
 #define MAX_ITERS_K_MEANS  6
 
diff --git a/thirdparty/libwebp/src/enc/backward_references_cost_enc.c b/thirdparty/libwebp/src/enc/backward_references_cost_enc.c
new file mode 100644
index 0000000000..7175496c7f
--- /dev/null
+++ b/thirdparty/libwebp/src/enc/backward_references_cost_enc.c
@@ -0,0 +1,790 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Improves a given set of backward references by analyzing its bit cost.
+// The algorithm is similar to the Zopfli compression algorithm but tailored to
+// images.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+//
+
+#include <assert.h>
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
+
+#define VALUES_IN_BYTE 256
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+
+typedef struct {
+  double alpha_[VALUES_IN_BYTE];
+  double red_[VALUES_IN_BYTE];
+  double blue_[VALUES_IN_BYTE];
+  double distance_[NUM_DISTANCE_CODES];
+  double* literal_;
+} CostModel;
+
+static void ConvertPopulationCountTableToBitEstimates(
+    int num_symbols, const uint32_t population_counts[], double output[]) {
+  uint32_t sum = 0;
+  int nonzeros = 0;
+  int i;
+  for (i = 0; i < num_symbols; ++i) {
+    sum += population_counts[i];
+    if (population_counts[i] > 0) {
+      ++nonzeros;
+    }
+  }
+  if (nonzeros <= 1) {
+    memset(output, 0, num_symbols * sizeof(*output));
+  } else {
+    const double logsum = VP8LFastLog2(sum);
+    for (i = 0; i < num_symbols; ++i) {
+      output[i] = logsum - VP8LFastLog2(population_counts[i]);
+    }
+  }
+}
+
+static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
+                          const VP8LBackwardRefs* const refs) {
+  int ok = 0;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+  if (histo == NULL) goto Error;
+
+  // The following code is similar to VP8LHistogramCreate but converts the
+  // distance to plane code.
+  VP8LHistogramInit(histo, cache_bits);
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
+                                    xsize);
+    VP8LRefsCursorNext(&c);
+  }
+
+  ConvertPopulationCountTableToBitEstimates(
+      VP8LHistogramNumCodes(histo->palette_code_bits_),
+      histo->literal_, m->literal_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->red_, m->red_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->blue_, m->blue_);
+  ConvertPopulationCountTableToBitEstimates(
+      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
+  ConvertPopulationCountTableToBitEstimates(
+      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
+  ok = 1;
+
+ Error:
+  VP8LFreeHistogram(histo);
+  return ok;
+}
+
+static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
+  return m->alpha_[v >> 24] +
+         m->red_[(v >> 16) & 0xff] +
+         m->literal_[(v >> 8) & 0xff] +
+         m->blue_[v & 0xff];
+}
+
+static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
+  const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
+  return m->literal_[literal_idx];
+}
+
+static WEBP_INLINE double GetLengthCost(const CostModel* const m,
+                                        uint32_t length) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(length, &code, &extra_bits);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
+}
+
+static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
+                                          uint32_t distance) {
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+  return m->distance_[code] + extra_bits;
+}
+
+static WEBP_INLINE void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LColorCache* const hashers,
+    const CostModel* const cost_model, int idx, int use_color_cache,
+    float prev_cost, float* const cost, uint16_t* const dist_array) {
+  double cost_val = prev_cost;
+  const uint32_t color = argb[idx];
+  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+  if (ix >= 0) {
+    // use_color_cache is true and hashers contains color
+    const double mul0 = 0.68;
+    cost_val += GetCacheCost(cost_model, ix) * mul0;
+  } else {
+    const double mul1 = 0.82;
+    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+    cost_val += GetLiteralCost(cost_model, color) * mul1;
+  }
+  if (cost[idx] > cost_val) {
+    cost[idx] = (float)cost_val;
+    dist_array[idx] = 1;  // only one is inserted.
+  }
+}
+
+// -----------------------------------------------------------------------------
+// CostManager and interval handling
+
+// Empirical value to avoid high memory consumption but good for performance.
+#define COST_CACHE_INTERVAL_SIZE_MAX 500
+
+// To perform backward reference every pixel at index index_ is considered and
+// the cost for the MAX_LENGTH following pixels computed. Those following pixels
+// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
+//     cost_ = distance cost at index + GetLengthCost(cost_model, k)
+// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
+// array of size MAX_LENGTH.
+// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
+// minimal values using intervals of constant cost.
+// An interval is defined by the index_ of the pixel that generated it and
+// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
+// it contains the minimum value for pixels between start_ and end_.
+// Intervals are stored in a linked list and ordered by start_. When a new
+// interval has a better value, old intervals are split or removed. There are
+// therefore no overlapping intervals.
+typedef struct CostInterval CostInterval;
+struct CostInterval {
+  float cost_;
+  int start_;
+  int end_;
+  int index_;
+  CostInterval* previous_;
+  CostInterval* next_;
+};
+
+// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
+typedef struct {
+  double cost_;
+  int start_;
+  int end_;       // Exclusive.
+} CostCacheInterval;
+
+// This structure is in charge of managing intervals and costs.
+// It caches the different CostCacheInterval, caches the different
+// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
+// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
+#define COST_MANAGER_MAX_FREE_LIST 10
+typedef struct {
+  CostInterval* head_;
+  int count_;  // The number of stored intervals.
+  CostCacheInterval* cache_intervals_;
+  size_t cache_intervals_size_;
+  double cost_cache_[MAX_LENGTH];  // Contains the GetLengthCost(cost_model, k).
+  float* costs_;
+  uint16_t* dist_array_;
+  // Most of the time, we only need few intervals -> use a free-list, to avoid
+  // fragmentation with small allocs in most common cases.
+  CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
+  CostInterval* free_intervals_;
+  // These are regularly malloc'd remains. Note that this list can't grow larger
+  // than size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST.
+  CostInterval* recycled_intervals_;
+} CostManager;
+
+static void CostIntervalAddToFreeList(CostManager* const manager,
+                                      CostInterval* const interval) {
+  interval->next_ = manager->free_intervals_;
+  manager->free_intervals_ = interval;
+}
+
+static int CostIntervalIsInFreeList(const CostManager* const manager,
+                                    const CostInterval* const interval) {
+  return (interval >= &manager->intervals_[0] &&
+          interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
+}
+
+static void CostManagerInitFreeList(CostManager* const manager) {
+  int i;
+  manager->free_intervals_ = NULL;
+  for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
+    CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
+  }
+}
+
+static void DeleteIntervalList(CostManager* const manager,
+                               const CostInterval* interval) {
+  while (interval != NULL) {
+    const CostInterval* const next = interval->next_;
+    if (!CostIntervalIsInFreeList(manager, interval)) {
+      WebPSafeFree((void*)interval);
+    }  // else: do nothing
+    interval = next;
+  }
+}
+
+static void CostManagerClear(CostManager* const manager) {
+  if (manager == NULL) return;
+
+  WebPSafeFree(manager->costs_);
+  WebPSafeFree(manager->cache_intervals_);
+
+  // Clear the interval lists.
+  DeleteIntervalList(manager, manager->head_);
+  manager->head_ = NULL;
+  DeleteIntervalList(manager, manager->recycled_intervals_);
+  manager->recycled_intervals_ = NULL;
+
+  // Reset pointers, count_ and cache_intervals_size_.
+  memset(manager, 0, sizeof(*manager));
+  CostManagerInitFreeList(manager);
+}
+
+static int CostManagerInit(CostManager* const manager,
+                           uint16_t* const dist_array, int pix_count,
+                           const CostModel* const cost_model) {
+  int i;
+  const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
+
+  manager->costs_ = NULL;
+  manager->cache_intervals_ = NULL;
+  manager->head_ = NULL;
+  manager->recycled_intervals_ = NULL;
+  manager->count_ = 0;
+  manager->dist_array_ = dist_array;
+  CostManagerInitFreeList(manager);
+
+  // Fill in the cost_cache_.
+  manager->cache_intervals_size_ = 1;
+  manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
+  for (i = 1; i < cost_cache_size; ++i) {
+    manager->cost_cache_[i] = GetLengthCost(cost_model, i);
+    // Get the number of bound intervals.
+    if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
+      ++manager->cache_intervals_size_;
+    }
+  }
+
+  // With the current cost model, we usually have below 20 intervals.
+  // The worst case scenario with a cost model would be if every length has a
+  // different cost, hence MAX_LENGTH but that is impossible with the current
+  // implementation that spirals around a pixel.
+  assert(manager->cache_intervals_size_ <= MAX_LENGTH);
+  manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
+      manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
+  if (manager->cache_intervals_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+
+  // Fill in the cache_intervals_.
+  {
+    CostCacheInterval* cur = manager->cache_intervals_;
+
+    // Consecutive values in cost_cache_ are compared and, whenever they differ,
+    // a new interval is created and bounded.
+    cur->start_ = 0;
+    cur->end_ = 1;
+    cur->cost_ = manager->cost_cache_[0];
+    for (i = 1; i < cost_cache_size; ++i) {
+      const double cost_val = manager->cost_cache_[i];
+      if (cost_val != cur->cost_) {
+        ++cur;
+        // Initialize an interval.
+        cur->start_ = i;
+        cur->cost_ = cost_val;
+      }
+      cur->end_ = i + 1;
+    }
+  }
+
+  manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
+  if (manager->costs_ == NULL) {
+    CostManagerClear(manager);
+    return 0;
+  }
+  // Set the initial costs_ high for every pixel as we will keep the minimum.
+  for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
+
+  return 1;
+}
+
+// Given the cost and the position that define an interval, update the cost at
+// pixel 'i' if it is smaller than the previously computed value.
+static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
+                                   int position, float cost) {
+  const int k = i - position;
+  assert(k >= 0 && k < MAX_LENGTH);
+
+  if (manager->costs_[i] > cost) {
+    manager->costs_[i] = cost;
+    manager->dist_array_[i] = k + 1;
+  }
+}
+
+// Applies UpdateCost() with the same 'position' and 'cost' to every pixel of
+// the half-open range ['start', 'end'). Nothing happens if the range is empty.
+static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
+                                              int start, int end, int position,
+                                              float cost) {
+  int pixel = start;
+  while (pixel < end) UpdateCost(manager, pixel++, position, cost);
+}
+
+// Links 'prev' and 'next' as neighbours; with no 'prev', 'next' becomes the head.
+static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
+                                         CostInterval* const prev,
+                                         CostInterval* const next) {
+  // Backward link: only needed when there actually is a successor.
+  if (next != NULL) next->previous_ = prev;
+  if (prev == NULL) {
+    manager->head_ = next;
+  } else {
+    prev->next_ = next;
+  }
+}
+
+// Pop an interval in the manager: unlink it from the list and recycle it.
+static WEBP_INLINE void PopInterval(CostManager* const manager,
+                                    CostInterval* const interval) {
+  if (interval == NULL) return;  // nothing to pop
+
+  ConnectIntervals(manager, interval->previous_, interval->next_);  // unlink
+  if (CostIntervalIsInFreeList(manager, interval)) {
+    CostIntervalAddToFreeList(manager, interval);
+  } else {  // recycle regularly malloc'd intervals too
+    interval->next_ = manager->recycled_intervals_;  // push on recycled list
+    manager->recycled_intervals_ = interval;
+  }
+  --manager->count_;
+  assert(manager->count_ >= 0);
+}
+
+// Update the cost at index i by going over all the stored intervals that
+// overlap with i (the list is walked from the head while start_ <= i).
+// If 'do_clean_intervals' is set to something different than 0, intervals that
+// end at or before 'i' will be popped.
+static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
+                                          int do_clean_intervals) {
+  CostInterval* current = manager->head_;
+
+  while (current != NULL && current->start_ <= i) {
+    CostInterval* const next = current->next_;  // saved: 'current' may be popped
+    if (current->end_ <= i) {
+      if (do_clean_intervals) {
+        // We have an outdated interval, remove it.
+        PopInterval(manager, current);
+      }
+    } else {
+      UpdateCost(manager, i, current->index_, current->cost_);
+    }
+    current = next;
+  }
+}
+
+// Given a current orphan interval and its previous interval, before
+// it was orphaned (which can be NULL), set it at the right place in the list
+// of intervals using the start_ ordering and the previous interval as a hint.
+static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
+                                               CostInterval* const current,
+                                               CostInterval* previous) {
+  assert(current != NULL);
+
+  if (previous == NULL) previous = manager->head_;  // no hint: start at the head
+  while (previous != NULL && current->start_ < previous->start_) {
+    previous = previous->previous_;  // hint starts too late: walk backwards
+  }
+  while (previous != NULL && previous->next_ != NULL &&
+         previous->next_->start_ < current->start_) {
+    previous = previous->next_;  // successor still starts earlier: walk forwards
+  }
+
+  if (previous != NULL) {
+    ConnectIntervals(manager, current, previous->next_);
+  } else {
+    ConnectIntervals(manager, current, manager->head_);  // becomes the new head
+  }
+  ConnectIntervals(manager, previous, current);
+}
+
+// Insert an interval in the list contained in the manager by starting at
+// interval_in as a hint. The intervals are sorted by start_ value.
+static WEBP_INLINE void InsertInterval(CostManager* const manager,
+                                       CostInterval* const interval_in,
+                                       float cost, int position, int start,
+                                       int end) {
+  CostInterval* interval_new;
+
+  if (start >= end) return;  // empty interval: nothing to insert
+  if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
+    // Serialize the interval if we cannot store it.
+    UpdateCostPerInterval(manager, start, end, position, cost);
+    return;
+  }
+  if (manager->free_intervals_ != NULL) {  // 1st choice: the free list
+    interval_new = manager->free_intervals_;
+    manager->free_intervals_ = interval_new->next_;
+  } else if (manager->recycled_intervals_ != NULL) {  // 2nd: recycled intervals
+    interval_new = manager->recycled_intervals_;
+    manager->recycled_intervals_ = interval_new->next_;
+  } else {  // malloc for good
+    interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
+    if (interval_new == NULL) {
+      // Write down the interval if we cannot create it.
+      UpdateCostPerInterval(manager, start, end, position, cost);
+      return;
+    }
+  }
+
+  interval_new->cost_ = cost;
+  interval_new->index_ = position;
+  interval_new->start_ = start;
+  interval_new->end_ = end;
+  PositionOrphanInterval(manager, interval_new, interval_in);  // link it in
+
+  ++manager->count_;
+}
+
+// Given a new cost interval defined by its start at position, its length value
+// and distance_cost, add its contributions to the previous intervals and costs.
+// If handling the interval or one of its subintervals becomes too heavy, its
+// contribution is added to the costs right away.
+static WEBP_INLINE void PushInterval(CostManager* const manager,
+                                     double distance_cost, int position,
+                                     int len) {
+  size_t i;
+  CostInterval* interval = manager->head_;
+  CostInterval* interval_next;
+  const CostCacheInterval* const cost_cache_intervals =
+      manager->cache_intervals_;
+  // If the interval is small enough, no need to deal with the heavy
+  // interval logic, just serialize it right away. This constant is empirical.
+  const int kSkipDistance = 10;
+
+  if (len < kSkipDistance) {
+    int j;
+    for (j = position; j < position + len; ++j) {
+      const int k = j - position;  // offset of pixel 'j' inside the new interval
+      float cost_tmp;
+      assert(k >= 0 && k < MAX_LENGTH);
+      cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
+
+      if (manager->costs_[j] > cost_tmp) {  // keep the minimum, as UpdateCost()
+        manager->costs_[j] = cost_tmp;
+        manager->dist_array_[j] = k + 1;
+      }
+    }
+    return;
+  }
+
+  for (i = 0; i < manager->cache_intervals_size_ &&
+              cost_cache_intervals[i].start_ < len;
+       ++i) {
+    // Define the intersection of the ith interval with the new one.
+    int start = position + cost_cache_intervals[i].start_;
+    const int end = position + (cost_cache_intervals[i].end_ > len
+                                 ? len
+                                 : cost_cache_intervals[i].end_);
+    const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
+
+    for (; interval != NULL && interval->start_ < end;  // scan is not restarted
+         interval = interval_next) {
+      interval_next = interval->next_;  // saved: 'interval' may be popped below
+
+      // Make sure we have some overlap
+      if (start >= interval->end_) continue;
+
+      if (cost >= interval->cost_) {
+        // When intervals are represented, the lower, the better.
+        // [**********************************************************[
+        // start                                                    end
+        //                   [----------------------------------[
+        //                   interval->start_       interval->end_
+        // If we are worse than what we already have, add whatever we have so
+        // far up to interval.
+        const int start_new = interval->end_;
+        InsertInterval(manager, interval, cost, position, start,
+                       interval->start_);
+        start = start_new;
+        if (start >= end) break;
+        continue;
+      }
+
+      if (start <= interval->start_) {
+        if (interval->end_ <= end) {
+          //                   [----------------------------------[
+          //                   interval->start_       interval->end_
+          // [**************************************************************[
+          // start                                                        end
+          // We can safely remove the old interval as it is fully included.
+          PopInterval(manager, interval);
+        } else {
+          //              [------------------------------------[
+          //              interval->start_        interval->end_
+          // [*****************************[
+          // start                       end
+          interval->start_ = end;  // old interval keeps only its tail
+          break;
+        }
+      } else {
+        if (end < interval->end_) {
+          // [--------------------------------------------------------------[
+          // interval->start_                                  interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          // We have to split the old interval as it fully contains the new one.
+          const int end_original = interval->end_;
+          interval->end_ = start;
+          InsertInterval(manager, interval, interval->cost_, interval->index_,
+                         end, end_original);
+          interval = interval->next_;
+          break;
+        } else {
+          // [------------------------------------[
+          // interval->start_        interval->end_
+          //                     [*****************************[
+          //                     start                       end
+          interval->end_ = start;  // old interval keeps only its head
+        }
+      }
+    }
+    // Insert the remaining interval from start to end.
+    InsertInterval(manager, interval, cost, position, start, end);
+  }
+}
+
+// Computes, per pixel, the cheapest way found to reach it and records the
+// matching step in 'dist_array' (see TraceBackwards). Returns 1 on success.
+static int BackwardReferencesHashChainDistanceOnly(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
+    uint16_t* const dist_array) {
+  int i;
+  int ok = 0, cc_init = 0;
+  const int pix_count = xsize * ysize;
+  const int use_color_cache = (cache_bits > 0);
+  const size_t literal_array_size =
+      sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+                        ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
+  VP8LColorCache hashers;
+  // Zero-filled on purpose: the 'Error' path runs CostManagerClear() even when
+  // we bail out before CostManagerInit(), so no field may be left indeterminate.
+  CostManager* cost_manager =
+      (CostManager*)WebPSafeCalloc(1ULL, sizeof(*cost_manager));
+  int offset_prev = -1, len_prev = -1, reach = 0;
+  double offset_cost = -1;
+  int first_offset_is_constant = -1;  // initialized with 'impossible' value
+
+  if (cost_model == NULL || cost_manager == NULL) goto Error;
+
+  cost_model->literal_ = (double*)(cost_model + 1);  // array follows the struct
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) goto Error;
+
+  if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
+    goto Error;
+  }
+
+  // We loop one pixel at a time, but store all currently best points to
+  // non-processed locations from this point.
+  dist_array[0] = 0;
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
+                                0.f, cost_manager->costs_, dist_array);
+
+  for (i = 1; i < pix_count; ++i) {
+    const float prev_cost = cost_manager->costs_[i - 1];
+    int offset, len;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+
+    // Try adding the pixel as a literal.
+    AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
+                                  use_color_cache, prev_cost,
+                                  cost_manager->costs_, dist_array);
+
+    // If we are dealing with a non-literal.
+    if (len >= 2) {
+      if (offset != offset_prev) {
+        const int code = VP8LDistanceToPlaneCode(xsize, offset);
+        offset_cost = GetDistanceCost(cost_model, code);
+        first_offset_is_constant = 1;
+        PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+      } else {
+        assert(offset_cost >= 0);
+        assert(len_prev >= 0);
+        assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
+        // Instead of considering all contributions from a pixel i by calling:
+        //         PushInterval(cost_manager, prev_cost + offset_cost, i, len);
+        // we optimize these contributions in case offset_cost stays the same
+        // for consecutive pixels. This describes a set of pixels similar to a
+        // previous set (e.g. constant color regions).
+        if (first_offset_is_constant) {
+          reach = i - 1 + len_prev - 1;  // last pixel covered by the previous copy
+          first_offset_is_constant = 0;
+        }
+
+        if (i + len - 1 > reach) {
+          // We can only go further with the same offset if the previous
+          // length was maxed, hence len_prev == len == MAX_LENGTH.
+          // TODO(vrabaud), bump i to the end right away (insert cache and
+          // update cost).
+          // TODO(vrabaud), check if one of the points in between does not have
+          // a lower cost.
+          // Already consider the pixel at "reach" to add intervals that are
+          // better than whatever we add.
+          int offset_j, len_j = 0;
+          int j;
+          assert(len == MAX_LENGTH || len == pix_count - i);
+          // Figure out the last consecutive pixel within [i, reach + 1] with
+          // the same offset.
+          for (j = i; j <= reach; ++j) {
+            VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
+            if (offset_j != offset) {
+              VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
+              break;
+            }
+          }
+          // Update the cost at j - 1 and j.
+          UpdateCostAtIndex(cost_manager, j - 1, 0);
+          UpdateCostAtIndex(cost_manager, j, 0);
+
+          PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
+                       j, len_j);
+          reach = j + len_j - 1;
+        }
+      }
+    }
+
+    UpdateCostAtIndex(cost_manager, i, 1);  // settle pixel i, pop expired intervals
+    offset_prev = offset;
+    len_prev = len;
+  }
+
+  ok = !refs->error_;
+Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  CostManagerClear(cost_manager);
+  WebPSafeFree(cost_model);
+  WebPSafeFree(cost_manager);
+  return ok;
+}
+
+// We pack the path at the end of *dist_array and return
+// a pointer to this part of the array. Example:
+// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
+static void TraceBackwards(uint16_t* const dist_array,
+                           int dist_array_size,
+                           uint16_t** const chosen_path,
+                           int* const chosen_path_size) {
+  uint16_t* path = dist_array + dist_array_size;  // write cursor, moves leftwards
+  uint16_t* cur = dist_array + dist_array_size - 1;  // read cursor: last entry
+  while (cur >= dist_array) {
+    const int k = *cur;  // step length stored at this entry
+    --path;
+    *path = k;
+    cur -= k;  // jump back by that step to the previous entry of the path
+  }
+  *chosen_path = path;
+  *chosen_path_size = (int)(dist_array + dist_array_size - path);
+}
+
+static int BackwardReferencesHashChainFollowChosenPath(
+    const uint32_t* const argb, int cache_bits,
+    const uint16_t* const chosen_path, int chosen_path_size,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
+  const int use_color_cache = (cache_bits > 0);
+  int ix;     // index into chosen_path
+  int i = 0;  // index of the current pixel in argb
+  int ok = 0;
+  int cc_init = 0;
+  VP8LColorCache hashers;
+  // Replays 'chosen_path' into 'refs': one copy, literal or cache index per entry.
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+
+  VP8LClearBackwardRefs(refs);
+  for (ix = 0; ix < chosen_path_size; ++ix) {
+    const int len = chosen_path[ix];
+    if (len != 1) {  // a copy of 'len' pixels
+      int k;
+      const int offset = VP8LHashChainFindOffset(hash_chain, i);
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        for (k = 0; k < len; ++k) {  // every copied pixel goes into the cache
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      i += len;
+    } else {  // a single pixel: cache index if possible, literal otherwise
+      PixOrCopy v;
+      const int idx =
+          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+      if (idx >= 0) {
+        // use_color_cache is true and hashers contains argb[i]
+        // push pixel as a color cache index
+        v = PixOrCopyCreateCacheIdx(idx);
+      } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
+        v = PixOrCopyCreateLiteral(argb[i]);
+      }
+      VP8LBackwardRefsCursorAdd(refs, v);
+      ++i;
+    }
+  }
+  ok = !refs->error_;  // CursorAdd reports allocation failures through error_
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Builds 'refs_dst' by tracing back the lowest-cost path. Returns 1 on success.
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
+                                         const uint32_t* const argb,
+                                         int cache_bits,
+                                         const VP8LHashChain* const hash_chain,
+                                         const VP8LBackwardRefs* const refs_src,
+                                         VP8LBackwardRefs* const refs_dst) {
+  int ok = 0;
+  const int dist_array_size = xsize * ysize;  // one entry per pixel
+  uint16_t* chosen_path = NULL;  // will point inside dist_array, not owned
+  int chosen_path_size = 0;
+  uint16_t* dist_array =
+      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+
+  if (dist_array == NULL) goto Error;
+
+  if (!BackwardReferencesHashChainDistanceOnly(  // step 1: per-pixel best steps
+          xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
+    goto Error;
+  }
+  TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
+  if (!BackwardReferencesHashChainFollowChosenPath(  // step 3: emit the symbols
+          argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
+          refs_dst)) {
+    goto Error;
+  }
+  ok = 1;
+ Error:
+  WebPSafeFree(dist_array);
+  return ok;
+}
diff --git a/thirdparty/libwebp/src/enc/backward_references_enc.c b/thirdparty/libwebp/src/enc/backward_references_enc.c
new file mode 100644
index 0000000000..39230188b9
--- /dev/null
+++ b/thirdparty/libwebp/src/enc/backward_references_enc.c
@@ -0,0 +1,943 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+//
+
+#include <assert.h>
+#include <math.h>
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
+
+#define MIN_BLOCK_SIZE 256  // lower bound applied to VP8LBackwardRefs block_size_
+
+#define MAX_ENTROPY    (1e30f)  // NOTE(review): not used in the visible code
+
+// 1M window (4M bytes) minus 120 special codes for short distances.
+#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
+
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
+
+// -----------------------------------------------------------------------------
+
+static const uint8_t plane_to_code_lut[128] = {  // 8 rows of 16 entries
+ 96,   73,  55,  39,  23,  13,   5,  1,  255, 255, 255, 255, 255, 255, 255, 255,
+ 101,  78,  58,  42,  26,  16,   8,  2,    0,   3,  9,   17,  27,  43,  59,  79,
+ 102,  86,  62,  46,  32,  20,  10,  6,    4,   7,  11,  21,  33,  47,  63,  87,
+ 105,  90,  70,  52,  37,  28,  18,  14,  12,  15,  19,  29,  38,  53,  71,  91,
+ 110,  99,  82,  66,  48,  35,  30,  24,  22,  25,  31,  36,  49,  67,  83, 100,
+ 115, 108,  94,  76,  64,  50,  44,  40,  34,  41,  45,  51,  65,  77,  95, 109,
+ 118, 113, 103,  92,  80,  68,  60,  56,  54,  57,  61,  69,  81,  93, 104, 114,
+ 119, 116, 111, 106,  97,  88,  84,  74,  72,  75,  85,  89,  98, 107, 112, 117
+};
+
+extern int VP8LDistanceToPlaneCode(int xsize, int dist);
+int VP8LDistanceToPlaneCode(int xsize, int dist) {
+  const int yoffset = dist / xsize;  // whole rows covered by 'dist'
+  const int xoffset = dist - yoffset * xsize;  // what remains within a row
+  if (xoffset <= 8 && yoffset < 8) {
+    return plane_to_code_lut[yoffset * 16 + 8 - xoffset] + 1;
+  } else if (xoffset > xsize - 8 && yoffset < 7) {  // remainder close to a full row
+    return plane_to_code_lut[(yoffset + 1) * 16 + 8 + (xsize - xoffset)] + 1;
+  }
+  return dist + 120;  // no short code applies: shift past the 120 special codes
+}
+
+// Returns the exact index where array1 and array2 are different. For an index
+// less than or equal to best_len_match, the return value just has to be strictly
+// less than best_len_match. The current behavior is to return 0 if the arrays
+// differ at index best_len_match, and the mismatch index itself otherwise.
+// If no two elements differ, it returns max_limit (as per VP8LVectorMismatch).
+static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
+                                       const uint32_t* const array2,
+                                       int best_len_match, int max_limit) {
+  // Before 'expensive' linear match, check if the two arrays match at the
+  // current best length index.
+  if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+  return VP8LVectorMismatch(array1, array2, max_limit);
+}
+
+// -----------------------------------------------------------------------------
+//  VP8LBackwardRefs
+
+struct PixOrCopyBlock {
+  PixOrCopyBlock* next_;   // next block in the singly-linked list (or NULL)
+  PixOrCopy* start_;       // first PixOrCopy entry stored in this block
+  int size_;               // number of entries of start_ currently in use
+};
+
+extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {  // frees no memory
+  assert(refs != NULL);
+  if (refs->tail_ != NULL) {
+    *refs->tail_ = refs->free_blocks_;  // recycle all blocks at once
+  }
+  refs->free_blocks_ = refs->refs_;  // used chain (+ old free list) is now free
+  refs->tail_ = &refs->refs_;        // next appended block becomes the first one
+  refs->last_block_ = NULL;
+  refs->refs_ = NULL;
+}
+
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* block;
+  assert(refs != NULL);
+  VP8LClearBackwardRefs(refs);  // moves every block onto the free list
+  for (block = refs->free_blocks_; block != NULL; block = refs->free_blocks_) {
+    refs->free_blocks_ = block->next_;  // detach the head before releasing it
+    WebPSafeFree(block);
+  }
+}
+
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
+  assert(refs != NULL);
+  memset(refs, 0, sizeof(*refs));  // empty lists, no error, no last block
+  refs->tail_ = &refs->refs_;      // the first appended block becomes refs_
+  refs->block_size_ =              // never smaller than MIN_BLOCK_SIZE
+      (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor c;  // returned by value
+  c.cur_block_ = refs->refs_;  // start on the first used block, if any
+  if (refs->refs_ != NULL) {
+    c.cur_pos = c.cur_block_->start_;
+    c.last_pos_ = c.cur_pos + c.cur_block_->size_;  // one past the last entry
+  } else {  // no block at all: every field of the cursor is NULL
+    c.cur_pos = NULL;
+    c.last_pos_ = NULL;
+  }
+  return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {  // advance by one block
+  PixOrCopyBlock* const following = c->cur_block_->next_;
+  c->cur_block_ = following;
+  c->cur_pos = (following != NULL) ? following->start_ : NULL;
+  c->last_pos_ = (following != NULL) ? following->start_ + following->size_ : NULL;
+}
+
+// Get a block (free list first, else malloc) and append it to refs; NULL on OOM.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* b = refs->free_blocks_;
+  if (b == NULL) {   // allocate new memory chunk
+    const size_t total_size =  // header followed by block_size_ entries
+        sizeof(*b) + refs->block_size_ * sizeof(*b->start_);
+    b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+    if (b == NULL) {
+      refs->error_ |= 1;  // the failure is reported through refs->error_
+      return NULL;
+    }
+    b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b));  // not always aligned
+  } else {  // recycle from free-list
+    refs->free_blocks_ = b->next_;
+  }
+  *refs->tail_ = b;         // link after the current last block (or as refs_)
+  refs->tail_ = &b->next_;
+  refs->last_block_ = b;
+  b->next_ = NULL;
+  b->size_ = 0;             // handed out empty
+  return b;
+}
+
+extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                      const PixOrCopy v);
+void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                               const PixOrCopy v) {
+  PixOrCopyBlock* block = refs->last_block_;
+  const int need_block = (block == NULL) || (block->size_ == refs->block_size_);
+  if (need_block) block = BackwardRefsNewBlock(refs);  // none yet, or last is full
+  if (block == NULL) return;  // allocation failed; refs->error_ is set
+  block->start_[block->size_] = v;  // append 'v' at the end of the block
+  ++block->size_;
+}
+
+// -----------------------------------------------------------------------------
+// Hash chains
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+  assert(p->size_ == 0);  // 'p' must not already hold an allocation
+  assert(p->offset_length_ == NULL);
+  assert(size > 0);
+  p->offset_length_ =  // one uint32_t entry per requested element
+      (uint32_t*)WebPSafeMalloc(size, sizeof(*p->offset_length_));
+  if (p->offset_length_ == NULL) return 0;  // allocation failed
+  p->size_ = size;
+
+  return 1;  // success
+}
+
+void VP8LHashChainClear(VP8LHashChain* const p) {
+  assert(p != NULL);
+  WebPSafeFree(p->offset_length_);
+  // Reset the fields to the state VP8LHashChainInit() asserts on entry.
+  p->size_ = 0;
+  p->offset_length_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)  // multiplier applied to argb[1]
+#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)  // multiplier applied to argb[0]
+
+static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
+  uint32_t key;  // hash of the two pixels argb[0] and argb[1]
+  key  = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;  // low 32 bits of product
+  key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
+  key = key >> (32 - HASH_BITS);  // keep only the top HASH_BITS bits
+  return key;
+}
+
+// Returns the maximum number of hash chain lookups to do for a given
+// compression quality. Return value in range [8, 86] for quality in [0, 100].
+static int GetMaxItersForQuality(int quality) {
+  return 8 + (quality * quality) / 128;  // grows quadratically with quality
+}
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+  const int max_window_size = (quality > 75) ? WINDOW_SIZE     // full window
+                            : (quality > 50) ? (xsize << 8)    // 256 * xsize
+                            : (quality > 25) ? (xsize << 6)    // 64 * xsize
+                            : (xsize << 4);                    // 16 * xsize
+  assert(xsize > 0);
+  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {  // caps 'len' at MAX_LENGTH
+  return (len >= MAX_LENGTH) ? MAX_LENGTH : len;
+}
+
+int VP8LHashChainFill(VP8LHashChain* const p, int quality,
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort) {  // returns 1 on success, 0 if malloc fails
+  const int size = xsize * ysize;
+  const int iter_max = GetMaxItersForQuality(quality);
+  const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
+  int pos;
+  int argb_comp;  // whether argb[pos] == argb[pos + 1] (reset to 0 after a run)
+  uint32_t base_position;
+  int32_t* hash_to_first_index;
+  // Temporarily use the p->offset_length_ as a hash chain.
+  int32_t* chain = (int32_t*)p->offset_length_;
+  assert(size > 0);
+  assert(p->size_ != 0);
+  assert(p->offset_length_ != NULL);
+
+  if (size <= 2) {  // tiny image: no chain is built, only zeroed entries
+    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+    return 1;
+  }
+
+  hash_to_first_index =
+      (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
+  if (hash_to_first_index == NULL) return 0;
+
+  // Set the int32_t array to -1.
+  memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
+  // Fill the chain linking pixels with the same hash.
+  argb_comp = (argb[0] == argb[1]);
+  for (pos = 0; pos < size - 2;) {  // 'pos' is advanced inside the body
+    uint32_t hash_code;
+    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+    if (argb_comp && argb_comp_next) {
+      // Consecutive pixels with the same color will share the same hash.
+      // We therefore use a different hash: the color and its repetition
+      // length.
+      uint32_t tmp[2];  // {color, remaining run length}, hashed as a pixel pair
+      uint32_t len = 1;
+      tmp[0] = argb[pos];
+      // Figure out how far the pixels are the same.
+      // The last pixel has a different 64 bit hash, as its next pixel does
+      // not have the same color, so we just need to get to the last pixel equal
+      // to its follower.
+      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+        ++len;
+      }
+      if (len > MAX_LENGTH) {
+        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+        // because they are linked to their predecessor and we automatically
+        // check that in the main for loop below. Skipping means setting no
+        // predecessor in the chain, hence -1.
+        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+        pos += len - MAX_LENGTH;
+        len = MAX_LENGTH;
+      }
+      // Process the rest of the hash chain.
+      while (len) {
+        tmp[1] = len--;
+        hash_code = GetPixPairHash64(tmp);
+        chain[pos] = hash_to_first_index[hash_code];  // previous pixel, same hash
+        hash_to_first_index[hash_code] = pos++;       // 'pos' is the newest one
+      }
+      argb_comp = 0;
+    } else {
+      // Just move one pixel forward.
+      hash_code = GetPixPairHash64(argb + pos);
+      chain[pos] = hash_to_first_index[hash_code];
+      hash_to_first_index[hash_code] = pos++;
+      argb_comp = argb_comp_next;
+    }
+  }
+  // Process the penultimate pixel.
+  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
+  WebPSafeFree(hash_to_first_index);
+
+  // Find the best match interval at each pixel, defined by an offset to the
+  // pixel and a length. The right-most pixel cannot match anything to the right
+  // (hence a best length of 0) and the left-most pixel nothing to the left
+  // (hence an offset of 0).
+  assert(size > 2);
+  p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+  for (base_position = size - 2; base_position > 0;) {  // decremented in the body
+    const int max_len = MaxFindCopyLength(size - 1 - base_position);
+    const uint32_t* const argb_start = argb + base_position;
+    int iter = iter_max;  // remaining number of candidates we may examine
+    int best_length = 0;
+    uint32_t best_distance = 0;
+    uint32_t best_argb;  // argb_start[best_length], used for a quick rejection
+    const int min_pos =
+        (base_position > window_size) ? base_position - window_size : 0;
+    const int length_max = (max_len < 256) ? max_len : 256;  // 'good enough' length
+    uint32_t max_base_position;
+
+    pos = chain[base_position];
+    if (!low_effort) {
+      int curr_length;
+      // Heuristic: use the comparison with the above line as an initialization.
+      if (base_position >= (uint32_t)xsize) {
+        curr_length = FindMatchLength(argb_start - xsize, argb_start,
+                                      best_length, max_len);
+        if (curr_length > best_length) {
+          best_length = curr_length;
+          best_distance = xsize;
+        }
+        --iter;
+      }
+      // Heuristic: compare to the previous pixel.
+      curr_length =
+          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+      if (curr_length > best_length) {
+        best_length = curr_length;
+        best_distance = 1;
+      }
+      --iter;
+      // Skip the for loop if we already have the maximum.
+      if (best_length == MAX_LENGTH) pos = min_pos - 1;
+    }
+    best_argb = argb_start[best_length];
+
+    for (; pos >= min_pos && --iter; pos = chain[pos]) {  // walk the hash chain
+      int curr_length;
+      assert(base_position > (uint32_t)pos);
+
+      if (argb[pos + best_length] != best_argb) continue;  // cannot beat the best
+
+      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
+      if (best_length < curr_length) {
+        best_length = curr_length;
+        best_distance = base_position - pos;
+        best_argb = argb_start[best_length];
+        // Stop if we have reached a good enough length.
+        if (best_length >= length_max) break;
+      }
+    }
+    // We have the best match but in case the two intervals continue matching
+    // to the left, we have the best matches for the left-extended pixels.
+    max_base_position = base_position;
+    while (1) {
+      assert(best_length <= MAX_LENGTH);
+      assert(best_distance <= WINDOW_SIZE);
+      p->offset_length_[base_position] =  // distance in high bits, length in low
+          (best_distance << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      --base_position;
+      // Stop if we don't have a match or if we are out of bounds.
+      if (best_distance == 0 || base_position == 0) break;
+      // Stop if we cannot extend the matching intervals to the left.
+      if (base_position < best_distance ||
+          argb[base_position - best_distance] != argb[base_position]) {
+        break;
+      }
+      // Stop if we are matching at its limit because there could be a closer
+      // matching interval with the same maximum length. Then again, if the
+      // matching interval is as close as possible (best_distance == 1), we will
+      // never find anything better so let's continue.
+      if (best_length == MAX_LENGTH && best_distance != 1 &&
+          base_position + MAX_LENGTH < max_base_position) {
+        break;
+      }
+      if (best_length < MAX_LENGTH) {
+        ++best_length;  // the match now also covers the pixel just to the left
+        max_base_position = base_position;
+      }
+    }
+  }
+  return 1;
+}
+
+// Appends one pixel to 'refs'. When the color cache is in use and already
+// holds 'pixel' at its hashed slot, a cache-index token is emitted; otherwise
+// a literal is emitted and, if the cache is in use, the slot is overwritten
+// with 'pixel'.
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+                                         VP8LColorCache* const hashers,
+                                         VP8LBackwardRefs* const refs) {
+  if (use_color_cache) {
+    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+    if (VP8LColorCacheLookup(hashers, key) == pixel) {
+      // Cache hit: the index alone is enough to describe the pixel.
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCacheIdx(key));
+      return;
+    }
+    // Cache miss: remember the pixel, then fall through to the literal.
+    VP8LColorCacheSet(hashers, key, pixel);
+  }
+  VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(pixel));
+}
+
+// Run-length flavored backward references: at each position the only copy
+// candidates are "repeat the previous pixel" (distance 1) and "repeat the
+// pixel one row above" (distance xsize). When neither candidate reaches
+// MIN_LENGTH, a single literal is emitted instead.
+// Returns 0 if the color cache cannot be initialized or if 'refs' reports an
+// error, non-zero otherwise.
+static int BackwardReferencesRle(int xsize, int ysize,
+                                 const uint32_t* const argb,
+                                 int cache_bits, VP8LBackwardRefs* const refs) {
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  VP8LColorCache hashers;
+  int pos;
+
+  if (use_color_cache) {
+    if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+  }
+  VP8LClearBackwardRefs(refs);
+  // Nothing precedes the first pixel, so it can only be a literal.
+  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+  for (pos = 1; pos < pix_count;) {
+    const uint32_t* const cur = argb + pos;
+    const int max_len = MaxFindCopyLength(pix_count - pos);
+    const int rle_len = FindMatchLength(cur, cur - 1, 0, max_len);
+    int prev_row_len = 0;
+    // There is no row above while we are still inside the first row.
+    if (pos >= xsize) {
+      prev_row_len = FindMatchLength(cur, cur - xsize, 0, max_len);
+    }
+    if (rle_len >= MIN_LENGTH && rle_len >= prev_row_len) {
+      // The copied pixel is the one that was just coded, so the color cache
+      // state is unaffected and needs no update.
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+      pos += rle_len;
+      continue;
+    }
+    if (prev_row_len >= MIN_LENGTH) {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+      if (use_color_cache) {
+        int k;
+        for (k = 0; k < prev_row_len; ++k) {
+          VP8LColorCacheInsert(&hashers, cur[k]);
+        }
+      }
+      pos += prev_row_len;
+      continue;
+    }
+    AddSingleLiteral(*cur, use_color_cache, &hashers, refs);
+    ++pos;
+  }
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
+  return !refs->error_;
+}
+
+// Builds the backward references for the xsize * ysize pixels of 'argb' from
+// the (offset, length) matches already stored in 'hash_chain'. At each
+// position either one literal (sent through the color cache when
+// cache_bits > 0) or one copy is appended to 'refs'.
+// Returns 0 if the color cache could not be initialized or if refs->error_ is
+// set once all pixels have been processed, 1 otherwise.
+static int BackwardReferencesLz77(int xsize, int ysize,
+                                  const uint32_t* const argb, int cache_bits,
+                                  const VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs) {
+  int i;
+  // Raised below to max(i, i_last_check) before each look-ahead.
+  int i_last_check = -1;
+  int ok = 0;
+  // Non-zero once 'hashers' has been initialized and therefore must be cleared.
+  int cc_init = 0;
+  const int use_color_cache = (cache_bits > 0);
+  const int pix_count = xsize * ysize;
+  VP8LColorCache hashers;
+
+  if (use_color_cache) {
+    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
+    if (!cc_init) goto Error;
+  }
+  VP8LClearBackwardRefs(refs);
+  // 'i' is advanced at the bottom of the loop by the length that was emitted.
+  for (i = 0; i < pix_count;) {
+    // Alternative#1: Code the pixels starting at 'i' using backward reference.
+    int offset = 0;
+    int len = 0;
+    int j;
+    VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
+    if (len >= MIN_LENGTH) {
+      const int len_ini = len;
+      int max_reach = 0;
+      // Last split point to examine, clamped to the final pixel.
+      const int j_max =
+          (i + len_ini >= pix_count) ? pix_count - 1 : i + len_ini;
+      // Only start from what we have not checked already.
+      i_last_check = (i > i_last_check) ? i : i_last_check;
+      // We know the best match for the current pixel but we try to find the
+      // best matches for the current pixel AND the next one combined.
+      // The naive method would use the intervals:
+      // [i,i+len) + [i+len, length of best match at i+len)
+      // while we check if we can use:
+      // [i,j) (where j<=i+len) + [j, length of best match at j)
+      for (j = i_last_check + 1; j <= j_max; ++j) {
+        const int len_j = VP8LHashChainFindLength(hash_chain, j);
+        const int reach =
+            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
+        if (reach > max_reach) {
+          // Stop the current copy right before j; a resulting length of 1 is
+          // emitted as a literal further down.
+          len = j - i;
+          max_reach = reach;
+          // Nothing can reach beyond the last pixel.
+          if (max_reach >= pix_count) break;
+        }
+      }
+    } else {
+      // Match too short to be worth a copy: code a single pixel.
+      len = 1;
+    }
+    // Go with literal or backward reference.
+    assert(len > 0);
+    if (len == 1) {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+    } else {
+      VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
+      if (use_color_cache) {
+        // Every pixel covered by the copy is inserted into the color cache.
+        for (j = i; j < i + len; ++j) VP8LColorCacheInsert(&hashers, argb[j]);
+      }
+    }
+    i += len;
+  }
+
+  ok = !refs->error_;
+ Error:
+  if (cc_init) VP8LColorCacheClear(&hashers);
+  return ok;
+}
+
+// Compute an LZ77 by forcing matches to happen within a given distance cost.
+// We therefore limit the algorithm to the lowest 32 values in the PlaneCode
+// definition.
+// 'hash_chain_best' is only read (best length/offset per position), while
+// hash_chain->offset_length_[] is overwritten for every position and then
+// handed to BackwardReferencesLz77() to produce 'refs'.
+// Returns 0 if the temporary run-length buffer cannot be allocated, otherwise
+// the return value of BackwardReferencesLz77().
+#define WINDOW_OFFSETS_SIZE_MAX 32
+static int BackwardReferencesLz77Box(int xsize, int ysize,
+                                     const uint32_t* const argb, int cache_bits,
+                                     const VP8LHashChain* const hash_chain_best,
+                                     VP8LHashChain* hash_chain,
+                                     VP8LBackwardRefs* const refs) {
+  int i;
+  const int pix_count = xsize * ysize;
+  // Moving pointer into counts_ini[], used only while filling the buffer.
+  uint16_t* counts;
+  // All usable offsets of the window, compacted in plane-code order.
+  int window_offsets[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  // Subset of window_offsets[] used when continuing the previous pixel's match.
+  int window_offsets_new[WINDOW_OFFSETS_SIZE_MAX] = {0};
+  int window_offsets_size = 0;
+  int window_offsets_new_size = 0;
+  // One 16-bit run length per pixel.
+  uint16_t* const counts_ini =
+      (uint16_t*)WebPSafeMalloc(xsize * ysize, sizeof(*counts_ini));
+  // Offset/length stored for the previous position (0/0 when none was kept).
+  int best_offset_prev = -1, best_length_prev = -1;
+  if (counts_ini == NULL) return 0;
+
+  // counts[i] counts how many times a pixel is repeated starting at position i.
+  // The buffer is filled backwards, starting with a run of 1 for the last
+  // pixel.
+  i = pix_count - 2;
+  counts = counts_ini + i;
+  counts[1] = 1;
+  for (; i >= 0; --i, --counts) {
+    if (argb[i] == argb[i + 1]) {
+      // Max out the counts to MAX_LENGTH.
+      counts[0] = counts[1] + (counts[1] != MAX_LENGTH);
+    } else {
+      counts[0] = 1;
+    }
+  }
+
+  // Figure out the window offsets around a pixel. They are stored in a
+  // spiraling order around the pixel as defined by VP8LDistanceToPlaneCode.
+  {
+    int x, y;
+    for (y = 0; y <= 6; ++y) {
+      for (x = -6; x <= 6; ++x) {
+        const int offset = y * xsize + x;
+        int plane_code;
+        // Ignore offsets that bring us after the pixel.
+        if (offset <= 0) continue;
+        plane_code = VP8LDistanceToPlaneCode(xsize, offset) - 1;
+        // Keep only the WINDOW_OFFSETS_SIZE_MAX lowest plane codes.
+        if (plane_code >= WINDOW_OFFSETS_SIZE_MAX) continue;
+        window_offsets[plane_code] = offset;
+      }
+    }
+    // For narrow images, not all plane codes are reached, so remove those.
+    // (In-place compaction: slots left at 0 were never assigned.)
+    for (i = 0; i < WINDOW_OFFSETS_SIZE_MAX; ++i) {
+      if (window_offsets[i] == 0) continue;
+      window_offsets[window_offsets_size++] = window_offsets[i];
+    }
+    // Given a pixel P, find the offsets that reach pixels unreachable from P-1
+    // with any of the offsets in window_offsets[].
+    for (i = 0; i < window_offsets_size; ++i) {
+      int j;
+      int is_reachable = 0;
+      for (j = 0; j < window_offsets_size && !is_reachable; ++j) {
+        is_reachable |= (window_offsets[i] == window_offsets[j] + 1);
+      }
+      if (!is_reachable) {
+        window_offsets_new[window_offsets_new_size] = window_offsets[i];
+        ++window_offsets_new_size;
+      }
+    }
+  }
+
+  // The first pixel has nothing before it to copy from.
+  hash_chain->offset_length_[0] = 0;
+  for (i = 1; i < pix_count; ++i) {
+    int ind;
+    int best_length = VP8LHashChainFindLength(hash_chain_best, i);
+    int best_offset;
+    int do_compute = 1;
+
+    if (best_length >= MAX_LENGTH) {
+      // Do not recompute the best match if we already have a maximal one in the
+      // window.
+      best_offset = VP8LHashChainFindOffset(hash_chain_best, i);
+      for (ind = 0; ind < window_offsets_size; ++ind) {
+        if (best_offset == window_offsets[ind]) {
+          do_compute = 0;
+          break;
+        }
+      }
+    }
+    if (do_compute) {
+      // Figure out if we should use the offset/length from the previous pixel
+      // as an initial guess and therefore only inspect the offsets in
+      // window_offsets_new[].
+      const int use_prev =
+          (best_length_prev > 1) && (best_length_prev < MAX_LENGTH);
+      const int num_ind =
+          use_prev ? window_offsets_new_size : window_offsets_size;
+      // The previous match, shortened by one pixel, is still valid here.
+      best_length = use_prev ? best_length_prev - 1 : 0;
+      best_offset = use_prev ? best_offset_prev : 0;
+      // Find the longest match in a window around the pixel.
+      for (ind = 0; ind < num_ind; ++ind) {
+        int curr_length = 0;
+        int j = i;
+        int j_offset =
+            use_prev ? i - window_offsets_new[ind] : i - window_offsets[ind];
+        // Skip candidates that start before the image or whose first pixel
+        // already differs.
+        if (j_offset < 0 || argb[j_offset] != argb[i]) continue;
+        // The longest match is the sum of how many times each pixel is
+        // repeated.
+        do {
+          const int counts_j_offset = counts_ini[j_offset];
+          const int counts_j = counts_ini[j];
+          if (counts_j_offset != counts_j) {
+            // The runs have different lengths: the match ends with the
+            // shorter one.
+            curr_length +=
+                (counts_j_offset < counts_j) ? counts_j_offset : counts_j;
+            break;
+          }
+          // The same color is repeated counts_j_offset times at j_offset and j.
+          curr_length += counts_j_offset;
+          j_offset += counts_j_offset;
+          j += counts_j_offset;
+        } while (curr_length <= MAX_LENGTH && j < pix_count &&
+                 argb[j_offset] == argb[j]);
+        if (best_length < curr_length) {
+          best_offset =
+              use_prev ? window_offsets_new[ind] : window_offsets[ind];
+          if (curr_length >= MAX_LENGTH) {
+            // Cannot do better than MAX_LENGTH: clamp and stop searching.
+            best_length = MAX_LENGTH;
+            break;
+          } else {
+            best_length = curr_length;
+          }
+        }
+      }
+    }
+
+    assert(i + best_length <= pix_count);
+    assert(best_length <= MAX_LENGTH);
+    if (best_length <= MIN_LENGTH) {
+      // Matches no longer than MIN_LENGTH are not kept.
+      hash_chain->offset_length_[i] = 0;
+      best_offset_prev = 0;
+      best_length_prev = 0;
+    } else {
+      // Same packing as read back by VP8LHashChainFindOffset/FindLength.
+      hash_chain->offset_length_[i] =
+          (best_offset << MAX_LENGTH_BITS) | (uint32_t)best_length;
+      best_offset_prev = best_offset;
+      best_length_prev = best_length;
+    }
+  }
+  hash_chain->offset_length_[0] = 0;
+  WebPSafeFree(counts_ini);
+
+  return BackwardReferencesLz77(xsize, ysize, argb, cache_bits, hash_chain,
+                                refs);
+}
+
+// -----------------------------------------------------------------------------
+
+// Walks all tokens of 'refs' and rewrites, in place, the distance of every
+// copy token with the value returned by VP8LDistanceToPlaneCode() for an image
+// that is 'xsize' pixels wide. Other tokens are left untouched.
+static void BackwardReferences2DLocality(int xsize,
+                                         const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor cursor;
+  for (cursor = VP8LRefsCursorInit(refs); VP8LRefsCursorOk(&cursor);
+       VP8LRefsCursorNext(&cursor)) {
+    int dist;
+    if (!PixOrCopyIsCopy(cursor.cur_pos)) continue;
+    dist = cursor.cur_pos->argb_or_distance;
+    cursor.cur_pos->argb_or_distance = VP8LDistanceToPlaneCode(xsize, dist);
+  }
+}
+
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled for the lower (<= 25) quality.
+// 'argb' is consumed in step with the tokens of 'refs': one pixel per literal
+// and PixOrCopyLength() pixels per copy.
+// On success *best_cache_bits receives the cache size with the lowest
+// estimated bit cost (ties go to the smaller size).
+// Returns 0 in case of memory error.
+static int CalculateBestCacheSize(const uint32_t* argb, int quality,
+                                  const VP8LBackwardRefs* const refs,
+                                  int* const best_cache_bits) {
+  int i;
+  const int cache_bits_max = (quality <= 25) ? 0 : *best_cache_bits;
+  double entropy_min = MAX_ENTROPY;
+  // cc_init[i] is non-zero once hashers[i] was initialized, so that the
+  // cleanup only clears caches that actually exist.
+  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+  // One simulated color cache per candidate size (index 0 is unused).
+  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  // One histogram per candidate size, index 0 meaning "no color cache".
+  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+  int ok = 0;
+
+  assert(cache_bits_max >= 0 && cache_bits_max <= MAX_COLOR_CACHE_BITS);
+
+  if (cache_bits_max == 0) {
+    *best_cache_bits = 0;
+    // Local color cache is disabled.
+    return 1;
+  }
+
+  // Allocate data.
+  for (i = 0; i <= cache_bits_max; ++i) {
+    histos[i] = VP8LAllocateHistogram(i);
+    if (histos[i] == NULL) goto Error;
+    // No cache to create for the "0 bits" candidate.
+    if (i == 0) continue;
+    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+    if (!cc_init[i]) goto Error;
+  }
+
+  // Find the cache_bits giving the lowest entropy. The search is done in a
+  // brute-force way as the function (entropy w.r.t cache_bits) can be
+  // anything in practice.
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t pix = *argb++;
+      const uint32_t a = (pix >> 24) & 0xff;
+      const uint32_t r = (pix >> 16) & 0xff;
+      const uint32_t g = (pix >>  8) & 0xff;
+      const uint32_t b = (pix >>  0) & 0xff;
+      // The keys of the caches can be derived from the longest one.
+      int key = VP8LHashPix(pix, 32 - cache_bits_max);
+      // Do not use the color cache for cache_bits = 0.
+      ++histos[0]->blue_[b];
+      ++histos[0]->literal_[g];
+      ++histos[0]->red_[r];
+      ++histos[0]->alpha_[a];
+      // Deal with cache_bits > 0.
+      // The key is shifted right by one bit each time the cache size drops by
+      // one bit.
+      for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+        if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+          // Cache hit: counted as a cache-index symbol in literal_[].
+          ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+        } else {
+          // Cache miss: store the pixel and count it as a plain literal.
+          VP8LColorCacheSet(&hashers[i], key, pix);
+          ++histos[i]->blue_[b];
+          ++histos[i]->literal_[g];
+          ++histos[i]->red_[r];
+          ++histos[i]->alpha_[a];
+        }
+      }
+    } else {
+      // We should compute the contribution of the (distance,length)
+      // histograms but those are the same independently from the cache size.
+      // As those constant contributions are in the end added to the other
+      // histogram contributions, we can safely ignore them.
+      int len = PixOrCopyLength(v);
+      // Bitwise complement of the first pixel: guaranteed to differ from it,
+      // so the first pixel of the copy is always inserted.
+      uint32_t argb_prev = *argb ^ 0xffffffffu;
+      // Update the color caches.
+      do {
+        if (*argb != argb_prev) {
+          // Efficiency: insert only if the color changes.
+          int key = VP8LHashPix(*argb, 32 - cache_bits_max);
+          for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+            hashers[i].colors_[key] = *argb;
+          }
+          argb_prev = *argb;
+        }
+        argb++;
+      } while (--len != 0);
+    }
+    VP8LRefsCursorNext(&c);
+  }
+
+  // Pick the candidate with the smallest estimated cost; i == 0 seeds the
+  // search and a later candidate must be strictly cheaper to replace it.
+  for (i = 0; i <= cache_bits_max; ++i) {
+    const double entropy = VP8LHistogramEstimateBits(histos[i]);
+    if (i == 0 || entropy < entropy_min) {
+      entropy_min = entropy;
+      *best_cache_bits = i;
+    }
+  }
+  ok = 1;
+Error:
+  // Shared cleanup for the success and failure paths.
+  for (i = 0; i <= cache_bits_max; ++i) {
+    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+    VP8LFreeHistogram(histos[i]);
+  }
+  return ok;
+}
+
+// Rewrites 'refs' in place for a color cache of 'cache_bits' bits: a literal
+// whose color is already in the cache becomes a cache-index token, any other
+// literal is inserted into the cache, and every pixel spanned by a copy token
+// is inserted as well ('argb' supplies those pixels).
+// Returns 0 if the color cache cannot be initialized, 1 otherwise.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+                                      int cache_bits,
+                                      VP8LBackwardRefs* const refs) {
+  int pixel_index = 0;
+  VP8LColorCache hashers;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  for (; VP8LRefsCursorOk(&c); VP8LRefsCursorNext(&c)) {
+    PixOrCopy* const token = c.cur_pos;
+    if (!PixOrCopyIsLiteral(token)) {
+      // refs was created without local cache, so the only other possible
+      // token kind here is a copy.
+      int copy_end;
+      assert(PixOrCopyIsCopy(token));
+      copy_end = pixel_index + token->len;
+      while (pixel_index < copy_end) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+      }
+      continue;
+    }
+    {
+      const uint32_t argb_literal = token->argb_or_distance;
+      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+      if (ix < 0) {
+        // Not cached yet: keep the literal and remember its color.
+        VP8LColorCacheInsert(&hashers, argb_literal);
+      } else {
+        // Already cached: replace the literal by its cache index.
+        *token = PixOrCopyCreateCacheIdx(ix);
+      }
+      ++pixel_index;
+    }
+  }
+  VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+// Low-effort path: a single LZ77 pass without color cache (*cache_bits is
+// forced to 0), followed by the distance-to-plane-code conversion.
+// Returns 'refs_lz77' on success and NULL if the LZ77 pass fails.
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+    int width, int height, const uint32_t* const argb,
+    int* const cache_bits, const VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs* const refs_lz77) {
+  *cache_bits = 0;
+  if (BackwardReferencesLz77(width, height, argb, 0, hash_chain, refs_lz77)) {
+    BackwardReferences2DLocality(width, refs_lz77);
+    return refs_lz77;
+  }
+  return NULL;
+}
+
+extern int VP8LBackwardReferencesTraceBackwards(
+    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
+    const VP8LHashChain* const hash_chain,
+    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
+
+// Tries every LZ77 variant whose bit is set in 'lz77_types_to_try', selects
+// the best color cache size for each, and keeps the candidate with the lowest
+// estimated bit cost. For the kLZ77Standard / kLZ77Box winners and
+// quality >= 25, the result is further refined with
+// VP8LBackwardReferencesTraceBackwards() when that lowers the cost. Finally
+// the copy distances are converted to plane codes.
+// 'best' and 'worst' are two work buffers; the returned pointer is one of
+// them. On input *cache_bits is the maximum number of cache bits allowed, on
+// success it holds the number of bits selected for the returned references.
+// Returns NULL if any step fails, consistently with
+// GetBackwardReferencesLowEffort(); in that case the content of both buffers
+// is unspecified and *cache_bits may already have been modified.
+static VP8LBackwardRefs* GetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
+    VP8LBackwardRefs* worst) {
+  const int cache_bits_initial = *cache_bits;
+  double bit_cost_best = -1;
+  VP8LHistogram* histo = NULL;
+  int lz77_type, lz77_type_best = 0;
+  VP8LHashChain hash_chain_box;
+  // Stays NULL on every 'goto Error' path so that callers can detect the
+  // failure; only set once all the steps below have succeeded.
+  VP8LBackwardRefs* result = NULL;
+  memset(&hash_chain_box, 0, sizeof(hash_chain_box));
+
+  histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
+  if (histo == NULL) goto Error;
+
+  // Visit the set bits of lz77_types_to_try from the lowest one up; each bit
+  // is cleared once handled so the loop stops when none is left.
+  for (lz77_type = 1; lz77_types_to_try;
+       lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
+    int res = 0;
+    double bit_cost;
+    int cache_bits_tmp = cache_bits_initial;
+    if ((lz77_types_to_try & lz77_type) == 0) continue;
+    // The candidate is always built in 'worst' and swapped with 'best' below
+    // if it turns out to be cheaper.
+    switch (lz77_type) {
+      case kLZ77RLE:
+        res = BackwardReferencesRle(width, height, argb, 0, worst);
+        break;
+      case kLZ77Standard:
+        // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
+        // cache is not that different in practice.
+        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
+        break;
+      case kLZ77Box:
+        if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
+        res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
+                                        &hash_chain_box, worst);
+        break;
+      default:
+        assert(0);
+    }
+    if (!res) goto Error;
+
+    // Next, try with a color cache and update the references.
+    if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) {
+      goto Error;
+    }
+    if (cache_bits_tmp > 0) {
+      if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) {
+        goto Error;
+      }
+    }
+
+    // Keep the best backward references.
+    VP8LHistogramCreate(histo, worst, cache_bits_tmp);
+    bit_cost = VP8LHistogramEstimateBits(histo);
+    if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
+      VP8LBackwardRefs* const tmp = worst;
+      worst = best;
+      best = tmp;
+      bit_cost_best = bit_cost;
+      *cache_bits = cache_bits_tmp;
+      lz77_type_best = lz77_type;
+    }
+  }
+  assert(lz77_type_best > 0);
+
+  // Improve on simple LZ77 but only for high quality (TraceBackwards is
+  // costly).
+  if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
+      quality >= 25) {
+    const VP8LHashChain* const hash_chain_tmp =
+        (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
+    // A failure here is not fatal: 'best' is simply kept as it is.
+    if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
+                                             hash_chain_tmp, best, worst)) {
+      double bit_cost_trace;
+      VP8LHistogramCreate(histo, worst, *cache_bits);
+      bit_cost_trace = VP8LHistogramEstimateBits(histo);
+      if (bit_cost_trace < bit_cost_best) best = worst;
+    }
+  }
+
+  BackwardReferences2DLocality(width, best);
+  result = best;
+
+Error:
+  VP8LHashChainClear(&hash_chain_box);
+  VP8LFreeHistogram(histo);
+  return result;
+}
+
+// Public entry point: forwards to the low-effort or the full search
+// depending on 'low_effort'. The low-effort path only uses 'refs_tmp1' and
+// ignores 'quality' and 'lz77_types_to_try'.
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int low_effort, int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
+    VP8LBackwardRefs* const refs_tmp2) {
+  if (!low_effort) {
+    return GetBackwardReferences(width, height, argb, quality,
+                                 lz77_types_to_try, cache_bits, hash_chain,
+                                 refs_tmp1, refs_tmp2);
+  }
+  return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
+                                        hash_chain, refs_tmp1);
+}
diff --git a/thirdparty/libwebp/enc/backward_references_enc.h b/thirdparty/libwebp/src/enc/backward_references_enc.h
index 3a19aa763e..103ddfdcb7 100644
--- a/thirdparty/libwebp/enc/backward_references_enc.h
+++ b/thirdparty/libwebp/src/enc/backward_references_enc.h
@@ -10,13 +10,13 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 
-#ifndef WEBP_ENC_BACKWARD_REFERENCES_H_
-#define WEBP_ENC_BACKWARD_REFERENCES_H_
+#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
+#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
 
 #include <assert.h>
 #include <stdlib.h>
-#include "../webp/types.h"
-#include "../webp/format_constants.h"
+#include "src/webp/types.h"
+#include "src/webp/format_constants.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -91,11 +91,6 @@ static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
   return p->len;
 }
 
-static WEBP_INLINE uint32_t PixOrCopyArgb(const PixOrCopy* const p) {
-  assert(p->mode == kLiteral);
-  return p->argb_or_distance;
-}
-
 static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
   assert(p->mode == kCacheIdx);
   assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
@@ -113,6 +108,16 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
 #define HASH_BITS 18
 #define HASH_SIZE (1 << HASH_BITS)
 
+// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
+// is used in VP8LHashChain.
+#define MAX_LENGTH_BITS 12
+#define WINDOW_SIZE_BITS 20
+// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
+#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
+#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
+#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
+#endif
+
 typedef struct VP8LHashChain VP8LHashChain;
 struct VP8LHashChain {
   // The 20 most significant bits contain the offset at which the best match
@@ -134,6 +139,24 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
                       int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
+static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] >> MAX_LENGTH_BITS;  // offset part
+}
+
+static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
+                                               const int base_position) {
+  return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);  // low MAX_LENGTH_BITS bits
+}
+
+static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
+                                              int base_position,
+                                              int* const offset_ptr,
+                                              int* const length_ptr) {
+  *offset_ptr = VP8LHashChainFindOffset(p, base_position);  // out: offset
+  *length_ptr = VP8LHashChainFindLength(p, base_position);  // out: length
+}
+
 // -----------------------------------------------------------------------------
 // VP8LBackwardRefs (block-based backward-references storage)
 
@@ -158,9 +181,6 @@ struct VP8LBackwardRefs {
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
 // Release memory for backward references.
 void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
-// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
-int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
-                         VP8LBackwardRefs* const dst);
 
 // Cursor for iterating on references content
 typedef struct {
@@ -189,6 +209,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // -----------------------------------------------------------------------------
 // Main entry points
 
+enum VP8LLZ77Type {  // One bit per variant, so values can be OR-ed together.
+  kLZ77Standard = 1,  // Matches taken from the hash chain.
+  kLZ77RLE = 2,       // Runs of the previous pixel or of the pixel above.
+  kLZ77Box = 4        // Matches restricted to a small window of offsets.
+};
+
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
@@ -197,11 +223,12 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs refs[2]);
+    int low_effort, int lz77_types_to_try, int* const cache_bits,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
+    VP8LBackwardRefs* const refs_tmp2);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif  // WEBP_ENC_BACKWARD_REFERENCES_H_
+#endif  // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
diff --git a/thirdparty/libwebp/enc/config_enc.c b/thirdparty/libwebp/src/enc/config_enc.c
index 4589dc0619..9d4828978e 100644
--- a/thirdparty/libwebp/enc/config_enc.c
+++ b/thirdparty/libwebp/src/enc/config_enc.c
@@ -12,10 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/encode.h"
+#include "src/webp/encode.h"
 
 //------------------------------------------------------------------------------
 // WebPConfig
diff --git a/thirdparty/libwebp/enc/cost_enc.c b/thirdparty/libwebp/src/enc/cost_enc.c
index c823f5a664..48fd9bc347 100644
--- a/thirdparty/libwebp/enc/cost_enc.c
+++ b/thirdparty/libwebp/src/enc/cost_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./cost_enc.h"
+#include "src/enc/cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Level cost tables
diff --git a/thirdparty/libwebp/enc/cost_enc.h b/thirdparty/libwebp/src/enc/cost_enc.h
index 99e4b37aa3..bdce1e6a3b 100644
--- a/thirdparty/libwebp/enc/cost_enc.h
+++ b/thirdparty/libwebp/src/enc/cost_enc.h
@@ -11,12 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_ENC_COST_H_
-#define WEBP_ENC_COST_H_
+#ifndef WEBP_ENC_COST_ENC_H_
+#define WEBP_ENC_COST_ENC_H_
 
 #include <assert.h>
 #include <stdlib.h>
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -79,4 +79,4 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_COST_H_ */
+#endif  /* WEBP_ENC_COST_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/delta_palettization_enc.c b/thirdparty/libwebp/src/enc/delta_palettization_enc.c
index eaf0f050ea..a61c8e6c93 100644
--- a/thirdparty/libwebp/enc/delta_palettization_enc.c
+++ b/thirdparty/libwebp/src/enc/delta_palettization_enc.c
@@ -10,11 +10,11 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#include "./delta_palettization_enc.h"
+#include "src/enc/delta_palettization_enc.h"
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../webp/types.h"
-#include "../dsp/lossless.h"
+#include "src/webp/types.h"
+#include "src/dsp/lossless.h"
 
 #define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
 
diff --git a/thirdparty/libwebp/enc/delta_palettization_enc.h b/thirdparty/libwebp/src/enc/delta_palettization_enc.h
index 63048ec6e8..b15e2cd487 100644
--- a/thirdparty/libwebp/enc/delta_palettization_enc.h
+++ b/thirdparty/libwebp/src/enc/delta_palettization_enc.h
@@ -10,11 +10,11 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_H_
+#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
+#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
 
-#include "../webp/encode.h"
-#include "../enc/vp8li_enc.h"
+#include "src/webp/encode.h"
+#include "src/enc/vp8li_enc.h"
 
 // Replaces enc->argb_[] input by a palettizable approximation of it,
 // and generates optimal enc->palette_[].
@@ -22,4 +22,4 @@
 // if delta-palettization is not producing expected saving.
 WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
 
-#endif  // WEBP_ENC_DELTA_PALETTIZATION_H_
+#endif  // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
diff --git a/thirdparty/libwebp/enc/filter_enc.c b/thirdparty/libwebp/src/enc/filter_enc.c
index 4bc367274c..580800bfb8 100644
--- a/thirdparty/libwebp/enc/filter_enc.c
+++ b/thirdparty/libwebp/src/enc/filter_enc.c
@@ -12,8 +12,8 @@
 // Author: somnath@google.com (Somnath Banerjee)
 
 #include <assert.h>
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
 
 // This table gives, for a given sharpness, the filtering strength to be
 // used (at least) in order to filter a given edge step delta.
@@ -65,6 +65,8 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta) {
 //------------------------------------------------------------------------------
 // Paragraph 15.4: compute the inner-edge filtering strength
 
+#if !defined(WEBP_REDUCE_SIZE)
+
 static int GetILevel(int sharpness, int level) {
   if (sharpness > 0) {
     if (sharpness > 4) {
@@ -129,11 +131,14 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   return sum;
 }
 
+#endif  // !defined(WEBP_REDUCE_SIZE)
+
 //------------------------------------------------------------------------------
 // Exposed APIs: Encoder should call the following 3 functions to adjust
 // loop filter strength
 
 void VP8InitFilter(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
   if (it->lf_stats_ != NULL) {
     int s, i;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
@@ -143,9 +148,13 @@ void VP8InitFilter(VP8EncIterator* const it) {
     }
     VP8SSIMDspInit();
   }
+#else
+  (void)it;
+#endif
 }
 
 void VP8StoreFilterStats(VP8EncIterator* const it) {
+#if !defined(WEBP_REDUCE_SIZE)
   int d;
   VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
@@ -177,10 +186,14 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
     DoFilter(it, level);
     (*it->lf_stats_)[s][level] += GetMBSSIM(it->yuv_in_, it->yuv_out2_);
   }
+#else  // defined(WEBP_REDUCE_SIZE)
+  (void)it;
+#endif  // !defined(WEBP_REDUCE_SIZE)
 }
 
 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
+#if !defined(WEBP_REDUCE_SIZE)
   if (it->lf_stats_ != NULL) {
     int s;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
@@ -196,7 +209,10 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) {
       }
       enc->dqm_[s].fstrength_ = best_level;
     }
-  } else if (enc->config_->filter_strength > 0) {
+    return;
+  }
+#endif  // !defined(WEBP_REDUCE_SIZE)
+  if (enc->config_->filter_strength > 0) {
     int max_level = 0;
     int s;
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
diff --git a/thirdparty/libwebp/enc/frame_enc.c b/thirdparty/libwebp/src/enc/frame_enc.c
index abef523bbf..2b0dc66410 100644
--- a/thirdparty/libwebp/enc/frame_enc.c
+++ b/thirdparty/libwebp/src/enc/frame_enc.c
@@ -14,10 +14,10 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../webp/format_constants.h"  // RIFF constants
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/format_constants.h"  // RIFF constants
 
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -200,11 +200,13 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
     const VP8MBInfo* const mb = &enc->mb_info_[n];
     p[mb->segment_]++;
   }
+#if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats != NULL) {
     for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
       enc->pic_->stats->segment_size[n] = p[n];
     }
   }
+#endif
   if (enc->segment_hdr_.num_segments_ > 1) {
     uint8_t* const probas = enc->proba_.segments_;
     probas[0] = GetProba(p[0] + p[1], p[2] + p[3]);
@@ -452,6 +454,8 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
 //------------------------------------------------------------------------------
 // ExtraInfo map / Debug function
 
+#if !defined(WEBP_DISABLE_STATS)
+
 #if SEGMENT_VISU
 static void SetBlock(uint8_t* p, int value, int size) {
   int y;
@@ -516,6 +520,20 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
+#else  // defined(WEBP_DISABLE_STATS)
+static void ResetSSE(VP8Encoder* const enc) {
+  (void)enc;
+}
+static void StoreSideInfo(const VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->extra_info != NULL) {
+    memset(pic->extra_info, 0,
+           enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+  }
+}
+#endif  // !defined(WEBP_DISABLE_STATS)
+
 static double GetPSNR(uint64_t mse, uint64_t size) {
   return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
 }
@@ -640,7 +658,7 @@ static int StatLoop(VP8Encoder* const enc) {
 // Main loops
 //
 
-static const int kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
+static const uint8_t kAverageBytesPerMB[8] = { 50, 24, 16, 9, 7, 5, 3, 2 };
 
 static int PreLoopInitialize(VP8Encoder* const enc) {
   int p;
@@ -670,6 +688,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   }
 
   if (ok) {      // All good. Finish up.
+#if !defined(WEBP_DISABLE_STATS)
     if (enc->pic_->stats != NULL) {  // finalize byte counters...
       int i, s;
       for (i = 0; i <= 2; ++i) {
@@ -678,6 +697,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
         }
       }
     }
+#endif
     VP8AdjustFilterStrength(it);     // ...and store filter stats.
   } else {
     // Something bad happened -> need to do some memory cleanup.
diff --git a/thirdparty/libwebp/enc/histogram_enc.c b/thirdparty/libwebp/src/enc/histogram_enc.c
index 808b6f78ab..056a972dda 100644
--- a/thirdparty/libwebp/enc/histogram_enc.c
+++ b/thirdparty/libwebp/src/enc/histogram_enc.c
@@ -10,16 +10,16 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <math.h>
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../utils/utils.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
 
 #define MAX_COST 1.e38
 
@@ -76,7 +76,7 @@ void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                             VP8LHistogram* const histo) {
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   while (VP8LRefsCursorOk(&c)) {
-    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, NULL, 0);
     VP8LRefsCursorNext(&c);
   }
 }
@@ -138,7 +138,9 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
 // -----------------------------------------------------------------------------
 
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v) {
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0) {
   if (PixOrCopyIsLiteral(v)) {
     ++histo->alpha_[PixOrCopyLiteral(v, 3)];
     ++histo->red_[PixOrCopyLiteral(v, 2)];
@@ -152,7 +154,13 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     int code, extra_bits;
     VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
     ++histo->literal_[NUM_LITERAL_CODES + code];
-    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    if (distance_modifier == NULL) {
+      VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+    } else {
+      VP8LPrefixEncodeBits(
+          distance_modifier(distance_modifier_arg0, PixOrCopyDistance(v)),
+          &code, &extra_bits);
+    }
     ++histo->distance_[code];
   }
 }
@@ -473,7 +481,7 @@ static void HistogramBuild(
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
     const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
-    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
+    VP8LHistogramAddSinglePixOrCopy(histograms[ix], v, NULL, 0);
     x += PixOrCopyLength(v);
     while (x >= xsize) {
       x -= xsize;
@@ -523,11 +531,12 @@ static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
 
 // Compact image_histo[] by merging some histograms with same bin_id together if
 // it's advantageous.
-static VP8LHistogram* HistogramCombineEntropyBin(
-    VP8LHistogramSet* const image_histo,
-    VP8LHistogram* cur_combo,
-    const uint16_t* const bin_map, int bin_map_size, int num_bins,
-    double combine_cost_factor, int low_effort) {
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+                                       VP8LHistogram* cur_combo,
+                                       const uint16_t* const bin_map,
+                                       int bin_map_size, int num_bins,
+                                       double combine_cost_factor,
+                                       int low_effort) {
   VP8LHistogram** const histograms = image_histo->histograms;
   int idx;
   // Work in-place: processed histograms are put at the beginning of
@@ -593,14 +602,13 @@ static VP8LHistogram* HistogramCombineEntropyBin(
       UpdateHistogramCost(histograms[idx]);
     }
   }
-  return cur_combo;
 }
 
+// Implement a Lehmer random number generator with a multiplicative constant of
+// 48271 and a modulo constant of 2^31 − 1.
 static uint32_t MyRand(uint32_t* const seed) {
-  *seed = (*seed * 16807ull) & 0xffffffffu;
-  if (*seed == 0) {
-    *seed = 1;
-  }
+  *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
+  assert(*seed > 0);
   return *seed;
 }
 
@@ -641,57 +649,75 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
 static void HistoQueueClear(HistoQueue* const histo_queue) {
   assert(histo_queue != NULL);
   WebPSafeFree(histo_queue->queue);
+  histo_queue->size = 0;
+  histo_queue->max_size = 0;
 }
 
-static void SwapHistogramPairs(HistogramPair *p1,
-                               HistogramPair *p2) {
-  const HistogramPair tmp = *p1;
-  *p1 = *p2;
-  *p2 = tmp;
+// Pop a specific pair in the queue by replacing it with the last one
+// and shrinking the queue.
+static void HistoQueuePopPair(HistoQueue* const histo_queue,
+                              HistogramPair* const pair) {
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  *pair = histo_queue->queue[histo_queue->size - 1];
+  --histo_queue->size;
 }
 
-// Given a valid priority queue in range [0, queue_size) this function checks
-// whether histo_queue[queue_size] should be accepted and swaps it with the
-// front if it is smaller. Otherwise, it leaves it as is.
-static void UpdateQueueFront(HistoQueue* const histo_queue) {
-  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
-
-  if (histo_queue->queue[histo_queue->size].cost_diff <
-      histo_queue->queue[0].cost_diff) {
-    SwapHistogramPairs(histo_queue->queue,
-                       histo_queue->queue + histo_queue->size);
+// Check whether a pair in the queue should be updated as head or not.
+static void HistoQueueUpdateHead(HistoQueue* const histo_queue,
+                                 HistogramPair* const pair) {
+  assert(pair->cost_diff < 0.);
+  assert(pair >= histo_queue->queue &&
+         pair < (histo_queue->queue + histo_queue->size));
+  assert(histo_queue->size > 0);
+  if (pair->cost_diff < histo_queue->queue[0].cost_diff) {
+    // Replace the best pair.
+    const HistogramPair tmp = histo_queue->queue[0];
+    histo_queue->queue[0] = *pair;
+    *pair = tmp;
   }
-  ++histo_queue->size;
-
-  // We cannot add more elements than the capacity.
-  // The allocation adds an extra element to the official capacity so that
-  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
-  assert(histo_queue->size <= histo_queue->max_size);
 }
 
-// -----------------------------------------------------------------------------
-
-static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
-                        HistogramPair* const pair) {
-  VP8LHistogram* h1;
-  VP8LHistogram* h2;
+// Create a pair from indices "idx1" and "idx2" and push it to the queue,
+// provided its cost is lower than "threshold", a non-positive entropy.
+// It returns the cost of the pair, or 0. if it is not lower than "threshold".
+static double HistoQueuePush(HistoQueue* const histo_queue,
+                             VP8LHistogram** const histograms, int idx1,
+                             int idx2, double threshold) {
+  const VP8LHistogram* h1;
+  const VP8LHistogram* h2;
+  HistogramPair pair;
   double sum_cost;
 
+  assert(threshold <= 0.);
   if (idx1 > idx2) {
     const int tmp = idx2;
     idx2 = idx1;
     idx1 = tmp;
   }
-  pair->idx1 = idx1;
-  pair->idx2 = idx2;
+  pair.idx1 = idx1;
+  pair.idx2 = idx2;
   h1 = histograms[idx1];
   h2 = histograms[idx2];
   sum_cost = h1->bit_cost_ + h2->bit_cost_;
-  pair->cost_combo = 0.;
-  GetCombinedHistogramEntropy(h1, h2, sum_cost, &pair->cost_combo);
-  pair->cost_diff = pair->cost_combo - sum_cost;
+  pair.cost_combo = 0.;
+  GetCombinedHistogramEntropy(h1, h2, sum_cost + threshold, &pair.cost_combo);
+  pair.cost_diff = pair.cost_combo - sum_cost;
+
+  // Do not even consider the pair if it does not improve the entropy.
+  if (pair.cost_diff >= threshold) return 0.;
+
+  // We cannot add more elements than the capacity.
+  assert(histo_queue->size < histo_queue->max_size);
+  histo_queue->queue[histo_queue->size++] = pair;
+  HistoQueueUpdateHead(histo_queue, &histo_queue->queue[histo_queue->size - 1]);
+
+  return pair.cost_diff;
 }
 
+// -----------------------------------------------------------------------------
+
 // Combines histograms by continuously choosing the one with the highest cost
 // reduction.
 static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
@@ -714,13 +740,11 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     clusters[i] = i;
     for (j = i + 1; j < image_histo_size; ++j) {
       // Initialize positions array.
-      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size]);
-      UpdateQueueFront(&histo_queue);
+      HistoQueuePush(&histo_queue, histograms, i, j, 0.);
     }
   }
 
   while (image_histo_size > 1 && histo_queue.size > 0) {
-    HistogramPair* copy_to;
     const int idx1 = histo_queue.queue[0].idx1;
     const int idx2 = histo_queue.queue[0].idx2;
     HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
@@ -733,31 +757,22 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     }
     --image_histo_size;
 
-    // Remove pairs intersecting the just combined best pair. This will
-    // therefore pop the head of the queue.
-    copy_to = histo_queue.queue;
-    for (i = 0; i < histo_queue.size; ++i) {
+    // Remove pairs intersecting the just combined best pair.
+    for (i = 0; i < histo_queue.size;) {
       HistogramPair* const p = histo_queue.queue + i;
       if (p->idx1 == idx1 || p->idx2 == idx1 ||
           p->idx1 == idx2 || p->idx2 == idx2) {
-        // Do not copy the invalid pair.
-        continue;
-      }
-      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
-        // Replace the top of the queue if we found better.
-        SwapHistogramPairs(histo_queue.queue, p);
+        HistoQueuePopPair(&histo_queue, p);
+      } else {
+        HistoQueueUpdateHead(&histo_queue, p);
+        ++i;
       }
-      SwapHistogramPairs(copy_to, p);
-      ++copy_to;
     }
-    histo_queue.size = (int)(copy_to - histo_queue.queue);
 
     // Push new pairs formed with combined histogram to the queue.
     for (i = 0; i < image_histo_size; ++i) {
       if (clusters[i] != idx1) {
-        PreparePair(histograms, idx1, clusters[i],
-                    &histo_queue.queue[histo_queue.size]);
-        UpdateQueueFront(&histo_queue);
+        HistoQueuePush(&histo_queue, histograms, idx1, clusters[i], 0.);
       }
     }
   }
@@ -777,90 +792,130 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
   return ok;
 }
 
-static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* tmp_histo,
-                                       VP8LHistogram* best_combo,
-                                       int quality, int min_cluster_size) {
+// Perform histogram aggregation using a stochastic approach.
+// 'do_greedy' is set to 1 if a greedy approach needs to be performed
+// afterwards, 0 otherwise.
+static int HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
+                                      int min_cluster_size,
+                                      int* const do_greedy) {
   int iter;
-  uint32_t seed = 0;
+  uint32_t seed = 1;
   int tries_with_no_success = 0;
   int image_histo_size = image_histo->size;
-  const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
-  const int outer_iters = image_histo_size * iter_mult;
-  const int num_pairs = image_histo_size / 2;
+  const int outer_iters = image_histo_size;
   const int num_tries_no_success = outer_iters / 2;
-  int idx2_max = image_histo_size - 1;
-  int do_brute_dorce = 0;
   VP8LHistogram** const histograms = image_histo->histograms;
+  // Priority queue of histogram pairs. Its size of "kHistoQueueSizeSqrt"^2
+  // impacts the quality of the compression and the speed: the smaller the
+  // faster but the worse for the compression.
+  HistoQueue histo_queue;
+  const int kHistoQueueSizeSqrt = 3;
+  int ok = 0;
 
+  if (!HistoQueueInit(&histo_queue, kHistoQueueSizeSqrt)) {
+    goto End;
+  }
   // Collapse similar histograms in 'image_histo'.
   ++min_cluster_size;
-  for (iter = 0;
-       iter < outer_iters && image_histo_size >= min_cluster_size;
+  for (iter = 0; iter < outer_iters && image_histo_size >= min_cluster_size &&
+                 ++tries_with_no_success < num_tries_no_success;
        ++iter) {
-    double best_cost_diff = 0.;
+    double best_cost =
+        (histo_queue.size == 0) ? 0. : histo_queue.queue[0].cost_diff;
     int best_idx1 = -1, best_idx2 = 1;
     int j;
-    int num_tries =
-        (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
-    // Use a brute force approach if:
-    // - stochastic has not worked for a while and
-    // - if the number of iterations for brute force is less than the number of
-    // iterations if we never find a match ever again stochastically (hence
-    // num_tries times the number of remaining outer iterations).
-    do_brute_dorce =
-        (tries_with_no_success > 10) &&
-        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
-    if (do_brute_dorce) num_tries = idx2_max;
-
-    seed += iter;
-    for (j = 0; j < num_tries; ++j) {
-      double curr_cost_diff;
-      // Choose two histograms at random and try to combine them.
-      uint32_t idx1, idx2;
-      if (do_brute_dorce) {
-        // Use a brute force approach.
-        idx1 = (uint32_t)j;
-        idx2 = (uint32_t)idx2_max;
-      } else {
-        const uint32_t tmp = (j & 7) + 1;
-        const uint32_t diff =
-            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
-        idx1 = MyRand(&seed) % image_histo_size;
-        idx2 = (idx1 + diff + 1) % image_histo_size;
-        if (idx1 == idx2) {
-          continue;
-        }
-      }
+    const uint32_t rand_range = (image_histo_size - 1) * image_histo_size;
+    // image_histo_size / 2 was chosen empirically. Less means faster but worse
+    // compression.
+    const int num_tries = image_histo_size / 2;
 
-      // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
-                                        tmp_histo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
-        HistogramSwap(&best_combo, &tmp_histo);
-        best_cost_diff = curr_cost_diff;
-        best_idx1 = idx1;
-        best_idx2 = idx2;
+    for (j = 0; j < num_tries; ++j) {
+      double curr_cost;
+      // Choose two different histograms at random and try to combine them.
+      const uint32_t tmp = MyRand(&seed) % rand_range;
+      const uint32_t idx1 = tmp / (image_histo_size - 1);
+      uint32_t idx2 = tmp % (image_histo_size - 1);
+      if (idx2 >= idx1) ++idx2;
+
+      // Calculate cost reduction on combination.
+      curr_cost =
+          HistoQueuePush(&histo_queue, histograms, idx1, idx2, best_cost);
+      if (curr_cost < 0) {  // found a better pair?
+        best_cost = curr_cost;
+        // Stop looking for pairs once the queue has reached full capacity.
+        if (histo_queue.size == histo_queue.max_size) break;
       }
     }
-    if (do_brute_dorce) --idx2_max;
-
-    if (best_idx1 >= 0) {
-      HistogramSwap(&best_combo, &histograms[best_idx1]);
-      // swap best_idx2 slot with last one (which is now unused)
-      --image_histo_size;
-      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
-      if (best_idx2 != image_histo_size) {
-        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
-        histograms[image_histo_size] = NULL;
-      }
-      tries_with_no_success = 0;
+    if (histo_queue.size == 0) continue;
+
+    // Merge the two best histograms.
+    best_idx1 = histo_queue.queue[0].idx1;
+    best_idx2 = histo_queue.queue[0].idx2;
+    assert(best_idx1 < best_idx2);
+    HistogramAddEval(histograms[best_idx1], histograms[best_idx2],
+                     histograms[best_idx1], 0);
+    // Swap the best_idx2 histogram with the last one (which is now unused).
+    --image_histo_size;
+    if (best_idx2 != image_histo_size) {
+      HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
     }
-    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
-      break;
+    histograms[image_histo_size] = NULL;
+    // Parse the queue and update each pair that deals with best_idx1,
+    // best_idx2 or image_histo_size.
+    for (j = 0; j < histo_queue.size;) {
+      HistogramPair* const p = histo_queue.queue + j;
+      const int is_idx1_best = p->idx1 == best_idx1 || p->idx1 == best_idx2;
+      const int is_idx2_best = p->idx2 == best_idx1 || p->idx2 == best_idx2;
+      int do_eval = 0;
+      // The front pair could have been duplicated by a random pick, so
+      // always check for (and drop) pairs made of both merged indices.
+      if (is_idx1_best && is_idx2_best) {
+        HistoQueuePopPair(&histo_queue, p);
+        continue;
+      }
+      // Any pair containing one of the two best indices should only refer to
+      // best_idx1. Its cost should also be updated.
+      if (is_idx1_best) {
+        p->idx1 = best_idx1;
+        do_eval = 1;
+      } else if (is_idx2_best) {
+        p->idx2 = best_idx1;
+        do_eval = 1;
+      }
+      if (p->idx2 == image_histo_size) {
+        // No need to re-evaluate here as it does not involve a pair
+        // containing best_idx1 or best_idx2.
+        p->idx2 = best_idx2;
+      }
+      assert(p->idx2 < image_histo_size);
+      // Make sure the index order is respected.
+      if (p->idx1 > p->idx2) {
+        const int tmp = p->idx2;
+        p->idx2 = p->idx1;
+        p->idx1 = tmp;
+      }
+      if (do_eval) {
+        // Re-evaluate the cost of an updated pair.
+        GetCombinedHistogramEntropy(histograms[p->idx1], histograms[p->idx2], 0,
+                                    &p->cost_diff);
+        if (p->cost_diff >= 0.) {
+          HistoQueuePopPair(&histo_queue, p);
+          continue;
+        }
+      }
+      HistoQueueUpdateHead(&histo_queue, p);
+      ++j;
     }
+
+    tries_with_no_success = 0;
   }
   image_histo->size = image_histo_size;
+  *do_greedy = (image_histo->size <= min_cluster_size);
+  ok = 1;
+
+End:
+  HistoQueueClear(&histo_queue);
+  return ok;
 }
 
 // -----------------------------------------------------------------------------
@@ -925,7 +980,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int quality, int low_effort,
                              int histo_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                              uint16_t* const histogram_symbols) {
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
@@ -933,7 +988,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int image_histo_raw_size = histo_xsize * histo_ysize;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
-  VP8LHistogram* cur_combo;
   // Don't attempt linear bin-partition heuristic for
   // histograms of small sizes (as bin_map will be very sparse) and
   // maximum quality q==100 (to preserve the compression gains at that level).
@@ -948,7 +1002,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Copies the histograms and computes its bit_cost.
   HistogramCopyAndAnalyze(orig_histo, image_histo);
 
-  cur_combo = tmp_histos->histograms[1];  // pick up working slot
   if (entropy_combine) {
     const int bin_map_size = orig_histo->size;
     // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
@@ -958,10 +1011,9 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
     HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
-                                           bin_map, bin_map_size,
-                                           entropy_combine_num_bins,
-                                           combine_cost_factor, low_effort);
+    HistogramCombineEntropyBin(image_histo, tmp_histo, bin_map, bin_map_size,
+                               entropy_combine_num_bins, combine_cost_factor,
+                               low_effort);
   }
 
   // Don't combine the histograms using stochastic and greedy heuristics for
@@ -970,10 +1022,11 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     const float x = quality / 100.f;
     // cubic ramp between 1 and MAX_HISTO_GREEDY:
     const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
-    HistogramCombineStochastic(image_histo, tmp_histos->histograms[0],
-                               cur_combo, quality, threshold_size);
-    if ((image_histo->size <= threshold_size) &&
-        !HistogramCombineGreedy(image_histo)) {
+    int do_greedy;
+    if (!HistogramCombineStochastic(image_histo, threshold_size, &do_greedy)) {
+      goto Error;
+    }
+    if (do_greedy && !HistogramCombineGreedy(image_histo)) {
       goto Error;
     }
   }
diff --git a/thirdparty/libwebp/enc/histogram_enc.h b/thirdparty/libwebp/src/enc/histogram_enc.h
index a9d258a166..15b1fbda34 100644
--- a/thirdparty/libwebp/enc/histogram_enc.h
+++ b/thirdparty/libwebp/src/enc/histogram_enc.h
@@ -11,14 +11,14 @@
 //
 // Models the histograms of literal and distance codes.
 
-#ifndef WEBP_ENC_HISTOGRAM_H_
-#define WEBP_ENC_HISTOGRAM_H_
+#ifndef WEBP_ENC_HISTOGRAM_ENC_H_
+#define WEBP_ENC_HISTOGRAM_ENC_H_
 
 #include <string.h>
 
-#include "./backward_references_enc.h"
-#include "../webp/format_constants.h"
-#include "../webp/types.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -90,7 +90,9 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
 
 // Accumulate a token 'v' into a histogram.
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
-                                     const PixOrCopy* const v);
+                                     const PixOrCopy* const v,
+                                     int (*const distance_modifier)(int, int),
+                                     int distance_modifier_arg0);
 
 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
   return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
@@ -103,7 +105,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              int quality, int low_effort,
                              int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_in,
-                             VP8LHistogramSet* const tmp_histos,
+                             VP8LHistogram* const tmp_histo,
                              uint16_t* const histogram_symbols);
 
 // Returns the entropy for the symbols in the input array.
@@ -120,4 +122,4 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 }
 #endif
 
-#endif  // WEBP_ENC_HISTOGRAM_H_
+#endif  // WEBP_ENC_HISTOGRAM_ENC_H_
diff --git a/thirdparty/libwebp/enc/iterator_enc.c b/thirdparty/libwebp/src/enc/iterator_enc.c
index e48d30bd31..cfacfd2401 100644
--- a/thirdparty/libwebp/enc/iterator_enc.c
+++ b/thirdparty/libwebp/src/enc/iterator_enc.c
@@ -13,7 +13,7 @@
 
 #include <string.h>
 
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // VP8Iterator
diff --git a/thirdparty/libwebp/enc/near_lossless_enc.c b/thirdparty/libwebp/src/enc/near_lossless_enc.c
index 2bd03ab20d..cadd14c664 100644
--- a/thirdparty/libwebp/enc/near_lossless_enc.c
+++ b/thirdparty/libwebp/src/enc/near_lossless_enc.c
@@ -17,18 +17,20 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "../dsp/lossless_common.h"
-#include "../utils/utils.h"
-#include "./vp8i_enc.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/utils.h"
+#include "src/enc/vp8li_enc.h"
+
+#if (WEBP_NEAR_LOSSLESS == 1)
 
 #define MIN_DIM_FOR_NEAR_LOSSLESS 64
 #define MAX_LIMIT_BITS             5
 
 // Quantizes the value up or down to a multiple of 1<<bits (or to 255),
 // choosing the closer one, resolving ties using bankers' rounding.
-static int FindClosestDiscretized(int a, int bits) {
-  const int mask = (1 << bits) - 1;
-  const int biased = a + (mask >> 1) + ((a >> bits) & 1);
+static uint32_t FindClosestDiscretized(uint32_t a, int bits) {
+  const uint32_t mask = (1u << bits) - 1;
+  const uint32_t biased = a + (mask >> 1) + ((a >> bits) & 1);
   assert(bits > 0);
   if (biased > 0xff) return 0xff;
   return biased & ~mask;
@@ -69,22 +71,30 @@ static int IsSmooth(const uint32_t* const prev_row,
 }
 
 // Adjusts pixel values of image with given maximum error.
-static void NearLossless(int xsize, int ysize, uint32_t* argb,
-                         int limit_bits, uint32_t* copy_buffer) {
+static void NearLossless(int xsize, int ysize, const uint32_t* argb_src,
+                         int stride, int limit_bits, uint32_t* copy_buffer,
+                         uint32_t* argb_dst) {
   int x, y;
   const int limit = 1 << limit_bits;
   uint32_t* prev_row = copy_buffer;
   uint32_t* curr_row = prev_row + xsize;
   uint32_t* next_row = curr_row + xsize;
-  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));
+  memcpy(curr_row, argb_src, xsize * sizeof(argb_src[0]));
+  memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
 
-  for (y = 1; y < ysize - 1; ++y) {
-    uint32_t* const curr_argb_row = argb + y * xsize;
-    uint32_t* const next_argb_row = curr_argb_row + xsize;
-    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
-    for (x = 1; x < xsize - 1; ++x) {
-      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
-        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+  for (y = 0; y < ysize; ++y, argb_src += stride, argb_dst += xsize) {
+    if (y == 0 || y == ysize - 1) {
+      memcpy(argb_dst, argb_src, xsize * sizeof(argb_src[0]));
+    } else {
+      memcpy(next_row, argb_src + stride, xsize * sizeof(argb_src[0]));
+      argb_dst[0] = argb_src[0];
+      argb_dst[xsize - 1] = argb_src[xsize - 1];
+      for (x = 1; x < xsize - 1; ++x) {
+        if (IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+          argb_dst[x] = curr_row[x];
+        } else {
+          argb_dst[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+        }
       }
     }
     {
@@ -97,26 +107,45 @@ static void NearLossless(int xsize, int ysize, uint32_t* argb,
   }
 }
 
-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst) {
   int i;
+  const int xsize = picture->width;
+  const int ysize = picture->height;
+  const int stride = picture->argb_stride;
   uint32_t* const copy_buffer =
       (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
   const int limit_bits = VP8LNearLosslessBits(quality);
-  assert(argb != NULL);
-  assert(limit_bits >= 0);
+  assert(argb_dst != NULL);
+  assert(limit_bits > 0);
   assert(limit_bits <= MAX_LIMIT_BITS);
   if (copy_buffer == NULL) {
     return 0;
   }
   // For small icon images, don't attempt to apply near-lossless compression.
-  if (xsize < MIN_DIM_FOR_NEAR_LOSSLESS && ysize < MIN_DIM_FOR_NEAR_LOSSLESS) {
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) ||
+      ysize < 3) {
+    for (i = 0; i < ysize; ++i) {
+      memcpy(argb_dst + i * xsize, picture->argb + i * picture->argb_stride,
+             xsize * sizeof(*argb_dst));
+    }
     WebPSafeFree(copy_buffer);
     return 1;
   }
 
-  for (i = limit_bits; i != 0; --i) {
-    NearLossless(xsize, ysize, argb, i, copy_buffer);
+  NearLossless(xsize, ysize, picture->argb, stride, limit_bits, copy_buffer,
+               argb_dst);
+  for (i = limit_bits - 1; i != 0; --i) {
+    NearLossless(xsize, ysize, argb_dst, xsize, i, copy_buffer, argb_dst);
   }
   WebPSafeFree(copy_buffer);
   return 1;
 }
+#else  // (WEBP_NEAR_LOSSLESS == 1)
+
+// Define a stub to suppress compiler warnings.
+extern void VP8LNearLosslessStub(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
+
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/thirdparty/libwebp/enc/picture_csp_enc.c b/thirdparty/libwebp/src/enc/picture_csp_enc.c
index e5d1c75a66..d531dd0282 100644
--- a/thirdparty/libwebp/enc/picture_csp_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,10 +15,12 @@
 #include <stdlib.h>
 #include <math.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/random_utils.h"
-#include "../utils/utils.h"
-#include "../dsp/yuv.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/random_utils.h"
+#include "src/utils/utils.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/yuv.h"
 
 // Uncomment to disable gamma-compression during RGB->U/V averaging
 #define USE_GAMMA_COMPRESSION
@@ -39,12 +41,15 @@ static const union {
 static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
                           int x_step, int y_step) {
   if (alpha == NULL) return 0;
-  while (height-- > 0) {
-    int x;
-    for (x = 0; x < width * x_step; x += x_step) {
-      if (alpha[x] != 0xff) return 1;  // TODO(skal): check 4/8 bytes at a time.
+  WebPInitAlphaProcessing();
+  if (x_step == 1) {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha8b(alpha, width)) return 1;
+    }
+  } else {
+    for (; height-- > 0; alpha += y_step) {
+      if (WebPHasAlpha32b(alpha, width)) return 1;
     }
-    alpha += y_step;
   }
   return 0;
 }
@@ -56,15 +61,10 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
     return CheckNonOpaque(picture->a, picture->width, picture->height,
                           1, picture->a_stride);
   } else {
-    int x, y;
-    const uint32_t* argb = picture->argb;
-    if (argb == NULL) return 0;
-    for (y = 0; y < picture->height; ++y) {
-      for (x = 0; x < picture->width; ++x) {
-        if (argb[x] < 0xff000000u) return 1;   // test any alpha values != 0xff
-      }
-      argb += picture->argb_stride;
-    }
+    const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
+    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
+                          picture->width, picture->height,
+                          4, picture->argb_stride * sizeof(*picture->argb));
   }
   return 0;
 }
@@ -171,7 +171,7 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)
 
 // float variant of gamma-correction
-// We use tables of different size and precision for the Rec709
+// We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
@@ -183,8 +183,8 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
-    const double a = 0.099;
-    const double thresh = 0.018;
+    const double a = 0.09929682680944;
+    const double thresh = 0.018053968510807;
     for (v = 0; v <= MAX_Y_T; ++v) {
       const double g = norm * v;
       if (g <= thresh * 4.5) {
@@ -856,7 +856,6 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
     return 0;
   }
   if (has_alpha) {
-    WebPInitAlphaProcessing();
     assert(step == 4);
 #if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
     assert(kAlphaFix + kGammaFix <= 31);
@@ -1085,40 +1084,45 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
 // automatic import / conversion
 
 static int Import(WebPPicture* const picture,
-                  const uint8_t* const rgb, int rgb_stride,
+                  const uint8_t* rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
   const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
   const uint8_t* g_ptr = rgb + 1;
   const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
-  const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
   const int width = picture->width;
   const int height = picture->height;
 
   if (!picture->use_argb) {
+    const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
                               0.f /* no dithering */, 0, picture);
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  VP8EncDspARGBInit();
+  VP8LDspInit();
+  WebPInitAlphaProcessing();
 
   if (import_alpha) {
     uint32_t* dst = picture->argb;
+    const int do_copy =
+        (!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
     assert(step == 4);
     for (y = 0; y < height; ++y) {
-      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
-      a_ptr += rgb_stride;
-      r_ptr += rgb_stride;
-      g_ptr += rgb_stride;
-      b_ptr += rgb_stride;
+      if (do_copy) {
+        memcpy(dst, rgb, width * 4);
+      } else {
+        // RGBA input order. Need to swap R and B.
+        VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+      }
+      rgb += rgb_stride;
       dst += picture->argb_stride;
     }
   } else {
     uint32_t* dst = picture->argb;
     assert(step >= 3);
     for (y = 0; y < height; ++y) {
-      VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+      WebPPackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
       r_ptr += rgb_stride;
       g_ptr += rgb_stride;
       b_ptr += rgb_stride;
@@ -1130,12 +1134,7 @@ static int Import(WebPPicture* const picture,
 
 // Public API
 
-int WebPPictureImportRGB(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return (picture != NULL && rgb != NULL)
-             ? Import(picture, rgb, rgb_stride, 3, 0, 0)
-             : 0;
-}
+#if !defined(WEBP_REDUCE_CSP)
 
 int WebPPictureImportBGR(WebPPicture* picture,
                          const uint8_t* rgb, int rgb_stride) {
@@ -1144,31 +1143,41 @@ int WebPPictureImportBGR(WebPPicture* picture,
              : 0;
 }
 
-int WebPPictureImportRGBA(WebPPicture* picture,
+int WebPPictureImportBGRA(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 1)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
              : 0;
 }
 
-int WebPPictureImportBGRA(WebPPicture* picture,
+
+int WebPPictureImportBGRX(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 1)
+             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
              : 0;
 }
 
-int WebPPictureImportRGBX(WebPPicture* picture,
+#endif   // WEBP_REDUCE_CSP
+
+int WebPPictureImportRGB(WebPPicture* picture,
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL && rgb != NULL)
+             ? Import(picture, rgb, rgb_stride, 3, 0, 0)
+             : 0;
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 1)
              : 0;
 }
 
-int WebPPictureImportBGRX(WebPPicture* picture,
+int WebPPictureImportRGBX(WebPPicture* picture,
                           const uint8_t* rgba, int rgba_stride) {
   return (picture != NULL && rgba != NULL)
-             ? Import(picture, rgba, rgba_stride, 4, 1, 0)
+             ? Import(picture, rgba, rgba_stride, 4, 0, 0)
              : 0;
 }
 
diff --git a/thirdparty/libwebp/enc/picture_enc.c b/thirdparty/libwebp/src/enc/picture_enc.c
index dfa66510fb..c691622d03 100644
--- a/thirdparty/libwebp/enc/picture_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_enc.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/dsp.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // WebPPicture
@@ -76,13 +76,12 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
   // allocate a new buffer.
-  memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
+  memory = WebPSafeMalloc(argb_size + WEBP_ALIGN_CST, sizeof(*picture->argb));
   if (memory == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  // TODO(skal): align plane to cache line?
   picture->memory_argb_ = memory;
-  picture->argb = (uint32_t*)memory;
+  picture->argb = (uint32_t*)WEBP_ALIGN(memory);
   picture->argb_stride = width;
   return 1;
 }
@@ -92,8 +91,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
       (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
   const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
   const int y_stride = width;
-  const int uv_width = (width + 1) >> 1;
-  const int uv_height = (height + 1) >> 1;
+  const int uv_width = (int)(((int64_t)width + 1) >> 1);
+  const int uv_height = (int)(((int64_t)height + 1) >> 1);
   const int uv_stride = uv_width;
   int a_width, a_stride;
   uint64_t y_size, uv_size, a_size, total_size;
@@ -118,8 +117,8 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
   total_size = y_size + a_size + 2 * uv_size;
 
   // Security and validation checks
-  if (width <= 0 || height <= 0 ||         // luma/alpha param error
-      uv_width < 0 || uv_height < 0) {     // u/v param error
+  if (width <= 0 || height <= 0 ||           // luma/alpha param error
+      uv_width <= 0 || uv_height <= 0) {     // u/v param error
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
   // allocate a new buffer.
@@ -271,9 +270,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
 }
 
 ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
 ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
 ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
 
 #undef ENCODE_FUNC
 
@@ -284,9 +285,11 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
 }
 
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+#if !defined(WEBP_REDUCE_CSP)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
 LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
+#endif  // WEBP_REDUCE_CSP
 
 #undef LOSSLESS_ENCODE_FUNC
 
diff --git a/thirdparty/libwebp/enc/picture_psnr_enc.c b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
index 9c0b229507..362a7c79be 100644
--- a/thirdparty/libwebp/enc/picture_psnr_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -11,11 +11,15 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include "src/webp/encode.h"
+
+#if !(defined(WEBP_DISABLE_STATS) || defined(WEBP_REDUCE_SIZE))
+
 #include <math.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
                                  const uint8_t* ref, int ref_stride,
@@ -210,4 +214,34 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
   return ok;
 }
 
-//------------------------------------------------------------------------------
+#else  // defined(WEBP_DISABLE_STATS)
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  (void)src;
+  (void)src_stride;
+  (void)ref;
+  (void)ref_stride;
+  (void)width;
+  (void)height;
+  (void)x_step;
+  (void)type;
+  if (distortion == NULL || result == NULL) return 0;
+  *distortion = 0.f;
+  *result = 0.f;
+  return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int i;
+  (void)src;
+  (void)ref;
+  (void)type;
+  if (results == NULL) return 0;
+  for (i = 0; i < 5; ++i) results[i] = 0.f;
+  return 1;
+}
+
+#endif  // !defined(WEBP_DISABLE_STATS)
diff --git a/thirdparty/libwebp/enc/picture_rescale_enc.c b/thirdparty/libwebp/src/enc/picture_rescale_enc.c
index 0b7181c0d7..58a6ae7b9d 100644
--- a/thirdparty/libwebp/enc/picture_rescale_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -11,12 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include "src/webp/encode.h"
+
+#if !defined(WEBP_REDUCE_SIZE)
+
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8i_enc.h"
-#include "../utils/rescaler_utils.h"
-#include "../utils/utils.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/rescaler_utils.h"
+#include "src/utils/utils.h"
 
 #define HALVE(x) (((x) + 1) >> 1)
 
@@ -261,4 +265,45 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   return 1;
 }
 
-//------------------------------------------------------------------------------
+#else  // defined(WEBP_REDUCE_SIZE)
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+  (void)src;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+  (void)picture;
+  return 0;
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  (void)src;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  (void)dst;
+  return 0;
+}
+
+int WebPPictureCrop(WebPPicture* pic,
+                    int left, int top, int width, int height) {
+  (void)pic;
+  (void)left;
+  (void)top;
+  (void)width;
+  (void)height;
+  return 0;
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+  (void)pic;
+  (void)width;
+  (void)height;
+  return 0;
+}
+#endif  // !defined(WEBP_REDUCE_SIZE)
diff --git a/thirdparty/libwebp/enc/picture_tools_enc.c b/thirdparty/libwebp/src/enc/picture_tools_enc.c
index 895df51156..be292d4391 100644
--- a/thirdparty/libwebp/enc/picture_tools_enc.c
+++ b/thirdparty/libwebp/src/enc/picture_tools_enc.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 
-#include "./vp8i_enc.h"
-#include "../dsp/yuv.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/dsp/yuv.h"
 
 static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
   return (0xff000000u | (r << 16) | (g << 8) | b);
@@ -25,20 +25,7 @@ static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
 
 #define SIZE 8
 #define SIZE2 (SIZE / 2)
-static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
-  int y, x;
-  for (y = 0; y < size; ++y) {
-    for (x = 0; x < size; ++x) {
-      if (ptr[x]) {
-        return 0;
-      }
-    }
-    ptr += stride;
-  }
-  return 1;
-}
-
-static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
+static int IsTransparentARGBArea(const uint32_t* ptr, int stride, int size) {
   int y, x;
   for (y = 0; y < size; ++y) {
     for (x = 0; x < size; ++x) {
@@ -51,7 +38,7 @@ static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
   return 1;
 }
 
-static void flatten(uint8_t* ptr, int v, int stride, int size) {
+static void Flatten(uint8_t* ptr, int v, int stride, int size) {
   int y;
   for (y = 0; y < size; ++y) {
     memset(ptr, v, size);
@@ -59,7 +46,7 @@ static void flatten(uint8_t* ptr, int v, int stride, int size) {
   }
 }
 
-static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
+static void FlattenARGB(uint32_t* ptr, uint32_t v, int stride, int size) {
   int x, y;
   for (y = 0; y < size; ++y) {
     for (x = 0; x < size; ++x) ptr[x] = v;
@@ -67,54 +54,114 @@ static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
   }
 }
 
+// Smoothen the luma components of transparent pixels. Return true if the whole
+// block is transparent.
+static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
+                         int y_stride, int width, int height) {
+  int sum = 0, count = 0;
+  int x, y;
+  const uint8_t* alpha_ptr = a_ptr;
+  uint8_t* luma_ptr = y_ptr;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      if (alpha_ptr[x] != 0) {
+        ++count;
+        sum += luma_ptr[x];
+      }
+    }
+    alpha_ptr += a_stride;
+    luma_ptr += y_stride;
+  }
+  if (count > 0 && count < width * height) {
+    const uint8_t avg_u8 = (uint8_t)(sum / count);
+    alpha_ptr = a_ptr;
+    luma_ptr = y_ptr;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        if (alpha_ptr[x] == 0) luma_ptr[x] = avg_u8;
+      }
+      alpha_ptr += a_stride;
+      luma_ptr += y_stride;
+    }
+  }
+  return (count == 0);
+}
+
 void WebPCleanupTransparentArea(WebPPicture* pic) {
   int x, y, w, h;
   if (pic == NULL) return;
   w = pic->width / SIZE;
   h = pic->height / SIZE;
 
-  // note: we ignore the left-overs on right/bottom
+  // note: we ignore the left-overs on right/bottom, except for SmoothenBlock().
   if (pic->use_argb) {
     uint32_t argb_value = 0;
     for (y = 0; y < h; ++y) {
       int need_reset = 1;
       for (x = 0; x < w; ++x) {
         const int off = (y * pic->argb_stride + x) * SIZE;
-        if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) {
+        if (IsTransparentARGBArea(pic->argb + off, pic->argb_stride, SIZE)) {
           if (need_reset) {
             argb_value = pic->argb[off];
             need_reset = 0;
           }
-          flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+          FlattenARGB(pic->argb + off, argb_value, pic->argb_stride, SIZE);
         } else {
           need_reset = 1;
         }
       }
     }
   } else {
-    const uint8_t* const a_ptr = pic->a;
+    const int width = pic->width;
+    const int height = pic->height;
+    const int y_stride = pic->y_stride;
+    const int uv_stride = pic->uv_stride;
+    const int a_stride = pic->a_stride;
+    uint8_t* y_ptr = pic->y;
+    uint8_t* u_ptr = pic->u;
+    uint8_t* v_ptr = pic->v;
+    const uint8_t* a_ptr = pic->a;
     int values[3] = { 0 };
-    if (a_ptr == NULL) return;    // nothing to do
-    for (y = 0; y < h; ++y) {
+    if (a_ptr == NULL || y_ptr == NULL || u_ptr == NULL || v_ptr == NULL) {
+      return;
+    }
+    for (y = 0; y + SIZE <= height; y += SIZE) {
       int need_reset = 1;
-      for (x = 0; x < w; ++x) {
-        const int off_a = (y * pic->a_stride + x) * SIZE;
-        const int off_y = (y * pic->y_stride + x) * SIZE;
-        const int off_uv = (y * pic->uv_stride + x) * SIZE2;
-        if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        if (SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                          SIZE, SIZE)) {
           if (need_reset) {
-            values[0] = pic->y[off_y];
-            values[1] = pic->u[off_uv];
-            values[2] = pic->v[off_uv];
+            values[0] = y_ptr[x];
+            values[1] = u_ptr[x >> 1];
+            values[2] = v_ptr[x >> 1];
             need_reset = 0;
           }
-          flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
-          flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
-          flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
+          Flatten(y_ptr + x,        values[0], y_stride,  SIZE);
+          Flatten(u_ptr + (x >> 1), values[1], uv_stride, SIZE2);
+          Flatten(v_ptr + (x >> 1), values[2], uv_stride, SIZE2);
         } else {
           need_reset = 1;
         }
       }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, SIZE);
+      }
+      a_ptr += SIZE * a_stride;
+      y_ptr += SIZE * y_stride;
+      u_ptr += SIZE2 * uv_stride;
+      v_ptr += SIZE2 * uv_stride;
+    }
+    if (y < height) {
+      const int sub_height = height - y;
+      for (x = 0; x + SIZE <= width; x += SIZE) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      SIZE, sub_height);
+      }
+      if (x < width) {
+        SmoothenBlock(a_ptr + x, a_stride, y_ptr + x, y_stride,
+                      width - x, sub_height);
+      }
     }
   }
 }
@@ -144,9 +191,9 @@ void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
 // Blend color and remove transparency info
 
 #define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
 #define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
 
 void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
   const int red = (background_rgb >> 16) & 0xff;
diff --git a/thirdparty/libwebp/enc/predictor_enc.c b/thirdparty/libwebp/src/enc/predictor_enc.c
index 0639b74f1c..f3715f515e 100644
--- a/thirdparty/libwebp/enc/predictor_enc.c
+++ b/thirdparty/libwebp/src/enc/predictor_enc.c
@@ -14,9 +14,9 @@
 //          Urvang Joshi (urvang@google.com)
 //          Vincent Rabaud (vrabaud@google.com)
 
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "./vp8li_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/enc/vp8li_enc.h"
 
 #define MAX_DIFF_COST (1e30f)
 
@@ -26,7 +26,6 @@ static const uint32_t kMaskAlpha = 0xff000000;
 
 // Mostly used to reduce code size + readability
 static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
-static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
 
 //------------------------------------------------------------------------------
 // Methods to calculate Entropy (Shannon).
@@ -90,6 +89,9 @@ static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
   }
 }
 
+#if (WEBP_NEAR_LOSSLESS == 1)
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
 static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
   const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
   const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
@@ -180,6 +182,7 @@ static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
 // max_quantization which is a power of 2, smaller than max_diff). Take care if
 // value and predict have undergone subtract green, which means that red and
 // blue are represented as offsets from green.
+#define NEAR_LOSSLESS_DIFF(a, b) (uint8_t)((((int)(a) - (int)(b))) & 0xff)
 static uint32_t NearLossless(uint32_t value, uint32_t predict,
                              int max_quantization, int max_diff,
                              int used_subtract_green) {
@@ -196,7 +199,7 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
   }
   if ((value >> 24) == 0 || (value >> 24) == 0xff) {
     // Preserve transparency of fully transparent or fully opaque pixels.
-    a = ((value >> 24) - (predict >> 24)) & 0xff;
+    a = NEAR_LOSSLESS_DIFF(value >> 24, predict >> 24);
   } else {
     a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
   }
@@ -209,15 +212,17 @@ static uint32_t NearLossless(uint32_t value, uint32_t predict,
     // The amount by which green has been adjusted during quantization. It is
     // subtracted from red and blue for compensation, to avoid accumulating two
     // quantization errors in them.
-    green_diff = (new_green - (value >> 8)) & 0xff;
+    green_diff = NEAR_LOSSLESS_DIFF(new_green, value >> 8);
   }
-  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+  r = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value >> 16, green_diff),
                             (predict >> 16) & 0xff, 0xff - new_green,
                             quantization);
-  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
-                            0xff - new_green, quantization);
+  b = NearLosslessComponent(NEAR_LOSSLESS_DIFF(value, green_diff),
+                            predict & 0xff, 0xff - new_green, quantization);
   return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
 }
+#undef NEAR_LOSSLESS_DIFF
+#endif  // (WEBP_NEAR_LOSSLESS == 1)
 
 // Stores the difference between the pixel and its prediction in "out".
 // In case of a lossy encoding, updates the source image to avoid propagating
@@ -244,6 +249,7 @@ static WEBP_INLINE void GetResidual(
       } else {
         predict = pred_func(current_row[x - 1], upper_row + x);
       }
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
           x == 0 || x == width - 1) {
         residual = VP8LSubPixels(current_row[x], predict);
@@ -254,6 +260,13 @@ static WEBP_INLINE void GetResidual(
         current_row[x] = VP8LAddPixels(predict, residual);
         // x is never 0 here so we do not need to update upper_row like below.
       }
+#else
+      (void)max_diffs;
+      (void)height;
+      (void)max_quantization;
+      (void)used_subtract_green;
+      residual = VP8LSubPixels(current_row[x], predict);
+#endif
       if ((current_row[x] & kMaskAlpha) == 0) {
         // If alpha is 0, cleanup RGB. We can choose the RGB values of the
         // residual for best compression. The prediction of alpha itself can be
@@ -296,11 +309,12 @@ static int GetBestPredictorForTile(int width, int height,
   const int max_x = GetMin(tile_size, width - start_x);
   // Whether there exist columns just outside the tile.
   const int have_left = (start_x > 0);
-  const int have_right = (max_x < width - start_x);
   // Position and size of the strip covering the tile and adjacent columns if
   // they exist.
   const int context_start_x = start_x - have_left;
-  const int context_width = max_x + have_left + have_right;
+#if (WEBP_NEAR_LOSSLESS == 1)
+  const int context_width = max_x + have_left + (max_x < width - start_x);
+#endif
   const int tiles_per_row = VP8LSubSampleSize(width, bits);
   // Prediction modes of the left and above neighbor tiles.
   const int left_mode = (tile_x > 0) ?
@@ -352,10 +366,12 @@ static int GetBestPredictorForTile(int width, int height,
       memcpy(current_row + context_start_x,
              argb + y * width + context_start_x,
              sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization > 1 && y >= 1 && y + 1 < height) {
         MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
                        max_diffs + context_start_x, used_subtract_green);
       }
+#endif
 
       GetResidual(width, height, upper_row, current_row, max_diffs, mode,
                   start_x, start_x + max_x, y, max_quantization, exact,
@@ -405,7 +421,9 @@ static void CopyImageWithPrediction(int width, int height,
   uint32_t* upper_row = argb_scratch;
   uint32_t* current_row = upper_row + width + 1;
   uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+#if (WEBP_NEAR_LOSSLESS == 1)
   uint8_t* lower_max_diffs = current_max_diffs + width;
+#endif
   int y;
 
   for (y = 0; y < height; ++y) {
@@ -420,6 +438,7 @@ static void CopyImageWithPrediction(int width, int height,
       PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
                    argb + y * width);
     } else {
+#if (WEBP_NEAR_LOSSLESS == 1)
       if (max_quantization > 1) {
         // Compute max_diffs for the lower row now, because that needs the
         // contents of argb for the current row, which we will overwrite with
@@ -432,6 +451,7 @@ static void CopyImageWithPrediction(int width, int height,
                          used_subtract_green);
         }
       }
+#endif
       for (x = 0; x < width;) {
         const int mode =
             (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
diff --git a/thirdparty/libwebp/enc/quant_enc.c b/thirdparty/libwebp/src/enc/quant_enc.c
index b118fb2a13..3b1a3129b5 100644
--- a/thirdparty/libwebp/enc/quant_enc.c
+++ b/thirdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,8 @@
 #include <math.h>
 #include <stdlib.h>  // for abs()
 
-#include "./vp8i_enc.h"
-#include "./cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/cost_enc.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -457,11 +457,11 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
 // Form the predictions in cache
 
 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
-const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
-const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
+const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
+const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
 
 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
-const int VP8I4ModeOffsets[NUM_BMODES] = {
+const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
 };
 
@@ -492,14 +492,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // |YYYY|....| 12
 // +----+----+
 
-const int VP8Scan[16] = {  // Luma
+const uint16_t VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
 };
 
-static const int VP8ScanUV[4 + 4] = {
+static const uint16_t VP8ScanUV[4 + 4] = {
   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
@@ -1162,7 +1162,7 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
+      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                           + VP8FixedCostsI16[mode] * lambda_d_i16;
       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
         continue;
diff --git a/thirdparty/libwebp/enc/syntax_enc.c b/thirdparty/libwebp/src/enc/syntax_enc.c
index 90665bd7e5..a9e5a6cf0f 100644
--- a/thirdparty/libwebp/enc/syntax_enc.c
+++ b/thirdparty/libwebp/src/enc/syntax_enc.c
@@ -13,10 +13,10 @@
 
 #include <assert.h>
 
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"  // RIFF constants
-#include "../webp/mux_types.h"         // ALPHA_FLAG
-#include "./vp8i_enc.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"  // RIFF constants
+#include "src/webp/mux_types.h"         // ALPHA_FLAG
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Helper functions
@@ -289,11 +289,17 @@ static int GeneratePartition0(VP8Encoder* const enc) {
 
   pos3 = VP8BitWriterPos(bw);
 
+#if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats) {
     enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
     enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
     enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
   }
+#else
+  (void)pos1;
+  (void)pos2;
+  (void)pos3;
+#endif
   if (bw->error_) {
     return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
diff --git a/thirdparty/libwebp/enc/token_enc.c b/thirdparty/libwebp/src/enc/token_enc.c
index 02a0d72cc6..3a2192acac 100644
--- a/thirdparty/libwebp/enc/token_enc.c
+++ b/thirdparty/libwebp/src/enc/token_enc.c
@@ -20,9 +20,9 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/utils/utils.h"
 
 #if !defined(DISABLE_TOKEN_BUFFER)
 
@@ -195,39 +195,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
 #undef TOKEN_ID
 
 //------------------------------------------------------------------------------
-// This function works, but isn't currently used. Saved for later.
-
-#if 0
-
-static void Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-}
-
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
-  const VP8Tokens* p = b->pages_;
-  while (p != NULL) {
-    const int N = (p->next_ == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
-    const token_t* const tokens = TOKEN_DATA(p);
-    while (n-- > N) {
-      const token_t token = tokens[n];
-      if (!(token & FIXED_PROBA_BIT)) {
-        Record((token >> 15) & 1, stats + (token & 0x3fffu));
-      }
-    }
-    p = p->next_;
-  }
-}
-
-#endif   // 0
-
-//------------------------------------------------------------------------------
 // Final coding pass, with known probabilities
 
 int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
@@ -283,8 +250,9 @@ size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
 
 #else     // DISABLE_TOKEN_BUFFER
 
-void VP8TBufferInit(VP8TBuffer* const b) {
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
   (void)b;
+  (void)page_size;
 }
 void VP8TBufferClear(VP8TBuffer* const b) {
   (void)b;
diff --git a/thirdparty/libwebp/enc/tree_enc.c b/thirdparty/libwebp/src/enc/tree_enc.c
index 2c40fe7f3d..64ed28360b 100644
--- a/thirdparty/libwebp/enc/tree_enc.c
+++ b/thirdparty/libwebp/src/enc/tree_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8i_enc.h"
+#include "src/enc/vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Default probabilities
diff --git a/thirdparty/libwebp/enc/vp8i_enc.h b/thirdparty/libwebp/src/enc/vp8i_enc.h
index 93c95ecbfb..3463491e9d 100644
--- a/thirdparty/libwebp/enc/vp8i_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8i_enc.h
@@ -11,16 +11,16 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_ENC_VP8ENCI_H_
-#define WEBP_ENC_VP8ENCI_H_
+#ifndef WEBP_ENC_VP8I_ENC_H_
+#define WEBP_ENC_VP8I_ENC_H_
 
 #include <string.h>     // for memcpy()
-#include "../dec/common_dec.h"
-#include "../dsp/dsp.h"
-#include "../utils/bit_writer_utils.h"
-#include "../utils/thread_utils.h"
-#include "../utils/utils.h"
-#include "../webp/encode.h"
+#include "src/dec/common_dec.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/encode.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,7 +32,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 6
-#define ENC_REV_VERSION 0
+#define ENC_REV_VERSION 1
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -75,10 +75,10 @@ typedef enum {   // Rate-distortion optimization levels
 #define U_OFF_ENC    (16)
 #define V_OFF_ENC    (16 + 8)
 
-extern const int VP8Scan[16];           // in quant.c
-extern const int VP8UVModeOffsets[4];   // in analyze.c
-extern const int VP8I16ModeOffsets[4];
-extern const int VP8I4ModeOffsets[NUM_BMODES];
+extern const uint16_t VP8Scan[16];
+extern const uint16_t VP8UVModeOffsets[4];
+extern const uint16_t VP8I16ModeOffsets[4];
+extern const uint16_t VP8I4ModeOffsets[NUM_BMODES];
 
 // Layout of prediction blocks
 // intra 16x16
@@ -330,9 +330,6 @@ int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
 // Estimate the final coded size given a set of 'probas'.
 size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
 
-// unused for now
-void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
-
 #endif  // !DISABLE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
@@ -502,19 +499,10 @@ int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
 // compressibility (no guarantee, though). Assumes that pic->use_argb is true.
 void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
 
-  // in near_lossless.c
-// Near lossless preprocessing in RGB color-space.
-int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
-// Near lossless adjustment for predictors.
-void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
-                                 const uint32_t* argb_orig,
-                                 uint32_t* argb, uint32_t* argb_scratch,
-                                 const uint32_t* const transform_data,
-                                 int quality, int subtract_green);
 //------------------------------------------------------------------------------
 
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_VP8ENCI_H_ */
+#endif  /* WEBP_ENC_VP8I_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/vp8l_enc.c b/thirdparty/libwebp/src/enc/vp8l_enc.c
index b1a793d956..312e521906 100644
--- a/thirdparty/libwebp/enc/vp8l_enc.c
+++ b/thirdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,20 +15,19 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "./vp8i_enc.h"
-#include "./vp8li_enc.h"
-#include "../dsp/lossless.h"
-#include "../dsp/lossless_common.h"
-#include "../utils/bit_writer_utils.h"
-#include "../utils/huffman_encode_utils.h"
-#include "../utils/utils.h"
-#include "../webp/format_constants.h"
-
-#include "./delta_palettization_enc.h"
-
-#define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/vp8li_enc.h"
+#include "src/dsp/lossless.h"
+#include "src/dsp/lossless_common.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/huffman_encode_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
+
+#include "src/enc/delta_palettization_enc.h"
+
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 
@@ -128,7 +127,10 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
                                    uint32_t palette[MAX_PALETTE_SIZE],
                                    int* const palette_size) {
   const int num_colors = WebPGetColorPalette(pic, palette);
-  if (num_colors > MAX_PALETTE_SIZE) return 0;
+  if (num_colors > MAX_PALETTE_SIZE) {
+    *palette_size = 0;
+    return 0;
+  }
   *palette_size = num_colors;
   qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
   if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
@@ -188,22 +190,33 @@ static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
 static int AnalyzeEntropy(const uint32_t* argb,
                           int width, int height, int argb_stride,
                           int use_palette,
+                          int palette_size, int transform_bits,
                           EntropyIx* const min_entropy_ix,
                           int* const red_and_blue_always_zero) {
   // Allocate histogram set with cache_bits = 0.
-  uint32_t* const histo =
-      (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+  uint32_t* histo;
+
+  if (use_palette && palette_size <= 16) {
+    // In the case of small palettes, we pack 2, 4 or 8 pixels together. In
+    // practice, small palettes are better than any other transform.
+    *min_entropy_ix = kPalette;
+    *red_and_blue_always_zero = 1;
+    return 1;
+  }
+  histo = (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
   if (histo != NULL) {
     int i, x, y;
-    const uint32_t* prev_row = argb;
-    const uint32_t* curr_row = argb + argb_stride;
-    for (y = 1; y < height; ++y) {
-      uint32_t prev_pix = curr_row[0];
-      for (x = 1; x < width; ++x) {
+    const uint32_t* prev_row = NULL;
+    const uint32_t* curr_row = argb;
+    uint32_t pix_prev = argb[0];  // Skip the first pixel.
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
         const uint32_t pix = curr_row[x];
-        const uint32_t pix_diff = VP8LSubPixels(pix, prev_pix);
-        if ((pix_diff == 0) || (pix == prev_row[x])) continue;
-        prev_pix = pix;
+        const uint32_t pix_diff = VP8LSubPixels(pix, pix_prev);
+        pix_prev = pix;
+        if ((pix_diff == 0) || (prev_row != NULL && pix == prev_row[x])) {
+          continue;
+        }
         AddSingle(pix,
                   &histo[kHistoAlpha * 256],
                   &histo[kHistoRed * 256],
@@ -264,8 +277,24 @@ static int AnalyzeEntropy(const uint32_t* argb,
           entropy_comp[kHistoRedPredSubGreen] +
           entropy_comp[kHistoGreenPred] +
           entropy_comp[kHistoBluePredSubGreen];
-      // Palette mode seems more efficient in a breakeven case. Bias with 1.0.
-      entropy[kPalette] = entropy_comp[kHistoPalette] - 1.0;
+      entropy[kPalette] = entropy_comp[kHistoPalette];
+
+      // When including transforms, there is an overhead in bits from
+      // storing them. This overhead is small but matters for small images.
+      // For spatial, there are 14 transformations.
+      entropy[kSpatial] += VP8LSubSampleSize(width, transform_bits) *
+                           VP8LSubSampleSize(height, transform_bits) *
+                           VP8LFastLog2(14);
+      // For color transforms: 24 as only 3 channels are considered in a
+      // ColorTransformElement.
+      entropy[kSpatialSubGreen] += VP8LSubSampleSize(width, transform_bits) *
+                                   VP8LSubSampleSize(height, transform_bits) *
+                                   VP8LFastLog2(24);
+      // For palettes, add the cost of storing the palette.
+      // We empirically estimate the cost of a compressed entry as 8 bits.
+      // The palette is differential-coded when compressed hence a much
+      // lower cost than sizeof(uint32_t)*8.
+      entropy[kPalette] += palette_size * 8;
 
       *min_entropy_ix = kDirect;
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
@@ -273,6 +302,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
           *min_entropy_ix = (EntropyIx)k;
         }
       }
+      assert((int)*min_entropy_ix <= last_mode_to_analyze);
       *red_and_blue_always_zero = 1;
       // Let's check if the histogram of the chosen entropy mode has
       // non-zero red and blue values. If all are zero, we can later skip
@@ -325,60 +355,95 @@ static int GetTransformBits(int method, int histo_bits) {
   return res;
 }
 
-static int AnalyzeAndInit(VP8LEncoder* const enc) {
+// Set of parameters to be used in each iteration of the cruncher.
+#define CRUNCH_CONFIGS_LZ77_MAX 2
+typedef struct {
+  int entropy_idx_;
+  int lz77s_types_to_try_[CRUNCH_CONFIGS_LZ77_MAX];
+  int lz77s_types_to_try_size_;
+} CrunchConfig;
+
+#define CRUNCH_CONFIGS_MAX kNumEntropyIx
+
+static int EncoderAnalyze(VP8LEncoder* const enc,
+                          CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX],
+                          int* const crunch_configs_size,
+                          int* const red_and_blue_always_zero) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
-  const int pix_cnt = width * height;
   const WebPConfig* const config = enc->config_;
   const int method = config->method;
   const int low_effort = (config->method == 0);
-  // we round the block size up, so we're guaranteed to have
-  // at max MAX_REFS_BLOCK_PER_IMAGE blocks used:
-  int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+  int i;
+  int use_palette;
+  int n_lz77s;
   assert(pic != NULL && pic->argb != NULL);
 
-  enc->use_cross_color_ = 0;
-  enc->use_predict_ = 0;
-  enc->use_subtract_green_ = 0;
-  enc->use_palette_ =
+  use_palette =
       AnalyzeAndCreatePalette(pic, low_effort,
                               enc->palette_, &enc->palette_size_);
 
   // TODO(jyrki): replace the decision to be based on an actual estimate
   // of entropy, or even spatial variance of entropy.
-  enc->histo_bits_ = GetHistoBits(method, enc->use_palette_,
+  enc->histo_bits_ = GetHistoBits(method, use_palette,
                                   pic->width, pic->height);
   enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
 
   if (low_effort) {
     // AnalyzeEntropy is somewhat slow.
-    enc->use_predict_ = !enc->use_palette_;
-    enc->use_subtract_green_ = !enc->use_palette_;
-    enc->use_cross_color_ = 0;
+    crunch_configs[0].entropy_idx_ = use_palette ? kPalette : kSpatialSubGreen;
+    n_lz77s = 1;
+    *crunch_configs_size = 1;
   } else {
-    int red_and_blue_always_zero;
     EntropyIx min_entropy_ix;
-    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
-                        enc->use_palette_, &min_entropy_ix,
-                        &red_and_blue_always_zero)) {
+    // Try out multiple LZ77 on images with few colors.
+    n_lz77s = (enc->palette_size_ > 0 && enc->palette_size_ <= 16) ? 2 : 1;
+    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride, use_palette,
+                        enc->palette_size_, enc->transform_bits_,
+                        &min_entropy_ix, red_and_blue_always_zero)) {
       return 0;
     }
-    enc->use_palette_ = (min_entropy_ix == kPalette);
-    enc->use_subtract_green_ =
-        (min_entropy_ix == kSubGreen) || (min_entropy_ix == kSpatialSubGreen);
-    enc->use_predict_ =
-        (min_entropy_ix == kSpatial) || (min_entropy_ix == kSpatialSubGreen);
-    enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
+    if (method == 6 && config->quality == 100) {
+      // Go brute force on all transforms.
+      *crunch_configs_size = 0;
+      for (i = 0; i < kNumEntropyIx; ++i) {
+        if (i != kPalette || use_palette) {
+          assert(*crunch_configs_size < CRUNCH_CONFIGS_MAX);
+          crunch_configs[(*crunch_configs_size)++].entropy_idx_ = i;
+        }
+      }
+    } else {
+      // Only choose the guessed best transform.
+      *crunch_configs_size = 1;
+      crunch_configs[0].entropy_idx_ = min_entropy_ix;
+    }
+  }
+  // Fill in the different LZ77s.
+  assert(n_lz77s <= CRUNCH_CONFIGS_LZ77_MAX);
+  for (i = 0; i < *crunch_configs_size; ++i) {
+    int j;
+    for (j = 0; j < n_lz77s; ++j) {
+      crunch_configs[i].lz77s_types_to_try_[j] =
+          (j == 0) ? kLZ77Standard | kLZ77RLE : kLZ77Box;
+    }
+    crunch_configs[i].lz77s_types_to_try_size_ = n_lz77s;
   }
+  return 1;
+}
 
+static int EncoderInit(VP8LEncoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const int pix_cnt = width * height;
+  // we round the block size up, so we're guaranteed to have
+  // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+  const int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
+  int i;
   if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
 
-  // palette-friendly input typically uses less literals
-  //  -> reduce block size a bit
-  if (enc->use_palette_) refs_block_size /= 2;
-  VP8LBackwardRefsInit(&enc->refs_[0], refs_block_size);
-  VP8LBackwardRefsInit(&enc->refs_[1], refs_block_size);
+  for (i = 0; i < 3; ++i) VP8LBackwardRefsInit(&enc->refs_[i], refs_block_size);
 
   return 1;
 }
@@ -571,11 +636,16 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
     length = write_trimmed_length ? trimmed_length : num_tokens;
     VP8LPutBits(bw, write_trimmed_length, 1);
     if (write_trimmed_length) {
-      const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1);
-      const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-      VP8LPutBits(bw, nbitpairs - 1, 3);
-      assert(trimmed_length >= 2);
-      VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+      if (trimmed_length == 2) {
+        VP8LPutBits(bw, 0, 3 + 2);     // nbitpairs=1, trimmed_length=2
+      } else {
+        const int nbits = BitsLog2Floor(trimmed_length - 2);
+        const int nbitpairs = nbits / 2 + 1;
+        assert(trimmed_length > 2);
+        assert(nbitpairs - 1 < 8);
+        VP8LPutBits(bw, nbitpairs - 1, 3);
+        VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
+      }
     }
     StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
   }
@@ -642,7 +712,7 @@ static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
 
 static WebPEncodingError StoreImageToBitMask(
     VP8LBitWriter* const bw, int width, int histo_bits,
-    VP8LBackwardRefs* const refs,
+    const VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
     const HuffmanTreeCode* const huffman_codes) {
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
@@ -665,7 +735,7 @@ static WebPEncodingError StoreImageToBitMask(
       codes = huffman_codes + 5 * histogram_ix;
     }
     if (PixOrCopyIsLiteral(v)) {
-      static const int order[] = { 1, 2, 0, 3 };
+      static const uint8_t order[] = { 1, 2, 0, 3 };
       int k;
       for (k = 0; k < 4; ++k) {
         const int code = PixOrCopyLiteral(v, order[k]);
@@ -705,7 +775,8 @@ static WebPEncodingError StoreImageToBitMask(
 static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
                                               const uint32_t* const argb,
                                               VP8LHashChain* const hash_chain,
-                                              VP8LBackwardRefs refs_array[2],
+                                              VP8LBackwardRefs* const refs_tmp1,
+                                              VP8LBackwardRefs* const refs_tmp2,
                                               int width, int height,
                                               int quality, int low_effort) {
   int i;
@@ -730,8 +801,9 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
-  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits,
-                                   hash_chain, refs_array);
+  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0,
+                                   kLZ77Standard | kLZ77RLE, &cache_bits,
+                                   hash_chain, refs_tmp1, refs_tmp2);
   if (refs == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -788,39 +860,37 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   return err;
 }
 
-static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
-                                             const uint32_t* const argb,
-                                             VP8LHashChain* const hash_chain,
-                                             VP8LBackwardRefs refs_array[2],
-                                             int width, int height, int quality,
-                                             int low_effort,
-                                             int use_cache, int* cache_bits,
-                                             int histogram_bits,
-                                             size_t init_byte_position,
-                                             int* const hdr_size,
-                                             int* const data_size) {
+static WebPEncodingError EncodeImageInternal(
+    VP8LBitWriter* const bw, const uint32_t* const argb,
+    VP8LHashChain* const hash_chain, VP8LBackwardRefs refs_array[3], int width,
+    int height, int quality, int low_effort, int use_cache,
+    const CrunchConfig* const config, int* cache_bits, int histogram_bits,
+    size_t init_byte_position, int* const hdr_size, int* const data_size) {
   WebPEncodingError err = VP8_ENC_OK;
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
   VP8LHistogramSet* histogram_image = NULL;
-  VP8LHistogramSet* tmp_histos = NULL;
+  VP8LHistogram* tmp_histo = NULL;
   int histogram_image_size = 0;
   size_t bit_array_size = 0;
-  HuffmanTree* huff_tree = NULL;
+  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+      3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
-  VP8LBackwardRefs refs;
-  VP8LBackwardRefs* best_refs;
+  VP8LBackwardRefs* refs_best;
+  VP8LBackwardRefs* refs_tmp;
   uint16_t* const histogram_symbols =
       (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
                                 sizeof(*histogram_symbols));
+  int lz77s_idx;
+  VP8LBitWriter bw_init = *bw, bw_best;
+  int hdr_size_tmp;
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
   assert(hdr_size != NULL);
   assert(data_size != NULL);
 
-  VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
   if (histogram_symbols == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -836,142 +906,162 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
-  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
-                         low_effort)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
-                                        low_effort, cache_bits, hash_chain,
-                                        refs_array);
-  if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  histogram_image =
-      VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
-  tmp_histos = VP8LAllocateHistogramSet(2, *cache_bits);
-  if (histogram_image == NULL || tmp_histos == NULL) {
+  if (huff_tree == NULL ||
+      !VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort) ||
+      !VP8LBitWriterInit(&bw_best, 0) ||
+      (config->lz77s_types_to_try_size_ > 1 &&
+       !VP8LBitWriterClone(bw, &bw_best))) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  for (lz77s_idx = 0; lz77s_idx < config->lz77s_types_to_try_size_;
+       ++lz77s_idx) {
+    refs_best = VP8LGetBackwardReferences(
+        width, height, argb, quality, low_effort,
+        config->lz77s_types_to_try_[lz77s_idx], cache_bits, hash_chain,
+        &refs_array[0], &refs_array[1]);
+    if (refs_best == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Keep the best references aside and use the other element from the first
+    // two as a temporary for later usage.
+    refs_tmp = &refs_array[refs_best == &refs_array[0] ? 1 : 0];
+
+    histogram_image =
+        VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
+    tmp_histo = VP8LAllocateHistogram(*cache_bits);
+    if (histogram_image == NULL || tmp_histo == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
 
-  // Build histogram image and symbols from backward references.
-  if (!VP8LGetHistoImageSymbols(width, height, &refs, quality, low_effort,
-                                histogram_bits, *cache_bits, histogram_image,
-                                tmp_histos, histogram_symbols)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  // Create Huffman bit lengths and codes for each histogram image.
-  histogram_image_size = histogram_image->size;
-  bit_array_size = 5 * histogram_image_size;
-  huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
-                                                   sizeof(*huffman_codes));
-  // Note: some histogram_image entries may point to tmp_histos[], so the latter
-  // need to outlive the following call to GetHuffBitLengthsAndCodes().
-  if (huffman_codes == NULL ||
-      !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-  // Free combined histograms.
-  VP8LFreeHistogramSet(histogram_image);
-  histogram_image = NULL;
+    // Build histogram image and symbols from backward references.
+    if (!VP8LGetHistoImageSymbols(width, height, refs_best, quality, low_effort,
+                                  histogram_bits, *cache_bits, histogram_image,
+                                  tmp_histo, histogram_symbols)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Create Huffman bit lengths and codes for each histogram image.
+    histogram_image_size = histogram_image->size;
+    bit_array_size = 5 * histogram_image_size;
+    huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
+                                                     sizeof(*huffman_codes));
+    // Note: some histogram_image entries may point to tmp_histo, so the latter
+    // needs to outlive the following call to GetHuffBitLengthsAndCodes().
+    if (huffman_codes == NULL ||
+        !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    // Free combined histograms.
+    VP8LFreeHistogramSet(histogram_image);
+    histogram_image = NULL;
 
-  // Free scratch histograms.
-  VP8LFreeHistogramSet(tmp_histos);
-  tmp_histos = NULL;
+    // Free scratch histogram.
+    VP8LFreeHistogram(tmp_histo);
+    tmp_histo = NULL;
 
-  // Color Cache parameters.
-  if (*cache_bits > 0) {
-    VP8LPutBits(bw, 1, 1);
-    VP8LPutBits(bw, *cache_bits, 4);
-  } else {
-    VP8LPutBits(bw, 0, 1);
-  }
+    // Color Cache parameters.
+    if (*cache_bits > 0) {
+      VP8LPutBits(bw, 1, 1);
+      VP8LPutBits(bw, *cache_bits, 4);
+    } else {
+      VP8LPutBits(bw, 0, 1);
+    }
 
-  // Huffman image + meta huffman.
-  {
-    const int write_histogram_image = (histogram_image_size > 1);
-    VP8LPutBits(bw, write_histogram_image, 1);
-    if (write_histogram_image) {
-      uint32_t* const histogram_argb =
-          (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
-                                    sizeof(*histogram_argb));
-      int max_index = 0;
-      uint32_t i;
-      if (histogram_argb == NULL) {
-        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-        goto Error;
-      }
-      for (i = 0; i < histogram_image_xysize; ++i) {
-        const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = (symbol_index << 8);
-        if (symbol_index >= max_index) {
-          max_index = symbol_index + 1;
+    // Huffman image + meta huffman.
+    {
+      const int write_histogram_image = (histogram_image_size > 1);
+      VP8LPutBits(bw, write_histogram_image, 1);
+      if (write_histogram_image) {
+        uint32_t* const histogram_argb =
+            (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
+                                      sizeof(*histogram_argb));
+        int max_index = 0;
+        uint32_t i;
+        if (histogram_argb == NULL) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        for (i = 0; i < histogram_image_xysize; ++i) {
+          const int symbol_index = histogram_symbols[i] & 0xffff;
+          histogram_argb[i] = (symbol_index << 8);
+          if (symbol_index >= max_index) {
+            max_index = symbol_index + 1;
+          }
         }
+        histogram_image_size = max_index;
+
+        VP8LPutBits(bw, histogram_bits - 2, 3);
+        err = EncodeImageNoHuffman(
+            bw, histogram_argb, hash_chain, refs_tmp, &refs_array[2],
+            VP8LSubSampleSize(width, histogram_bits),
+            VP8LSubSampleSize(height, histogram_bits), quality, low_effort);
+        WebPSafeFree(histogram_argb);
+        if (err != VP8_ENC_OK) goto Error;
       }
-      histogram_image_size = max_index;
-
-      VP8LPutBits(bw, histogram_bits - 2, 3);
-      err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
-                                 VP8LSubSampleSize(width, histogram_bits),
-                                 VP8LSubSampleSize(height, histogram_bits),
-                                 quality, low_effort);
-      WebPSafeFree(histogram_argb);
-      if (err != VP8_ENC_OK) goto Error;
     }
-  }
 
-  // Store Huffman codes.
-  {
-    int i;
-    int max_tokens = 0;
-    huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
-                                             sizeof(*huff_tree));
-    if (huff_tree == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
-    }
-    // Find maximum number of symbols for the huffman tree-set.
-    for (i = 0; i < 5 * histogram_image_size; ++i) {
-      HuffmanTreeCode* const codes = &huffman_codes[i];
-      if (max_tokens < codes->num_symbols) {
-        max_tokens = codes->num_symbols;
+    // Store Huffman codes.
+    {
+      int i;
+      int max_tokens = 0;
+      // Find maximum number of symbols for the huffman tree-set.
+      for (i = 0; i < 5 * histogram_image_size; ++i) {
+        HuffmanTreeCode* const codes = &huffman_codes[i];
+        if (max_tokens < codes->num_symbols) {
+          max_tokens = codes->num_symbols;
+        }
+      }
+      tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+      if (tokens == NULL) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
+      for (i = 0; i < 5 * histogram_image_size; ++i) {
+        HuffmanTreeCode* const codes = &huffman_codes[i];
+        StoreHuffmanCode(bw, huff_tree, tokens, codes);
+        ClearHuffmanTreeIfOnlyOneSymbol(codes);
       }
     }
-    tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens,
-                                               sizeof(*tokens));
-    if (tokens == NULL) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+    // Store actual literals.
+    hdr_size_tmp = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
+    err = StoreImageToBitMask(bw, width, histogram_bits, refs_best,
+                              histogram_symbols, huffman_codes);
+    // Keep track of the smallest image so far.
+    if (lz77s_idx == 0 ||
+        VP8LBitWriterNumBytes(bw) < VP8LBitWriterNumBytes(&bw_best)) {
+      *hdr_size = hdr_size_tmp;
+      *data_size =
+          (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+      VP8LBitWriterSwap(bw, &bw_best);
     }
-    for (i = 0; i < 5 * histogram_image_size; ++i) {
-      HuffmanTreeCode* const codes = &huffman_codes[i];
-      StoreHuffmanCode(bw, huff_tree, tokens, codes);
-      ClearHuffmanTreeIfOnlyOneSymbol(codes);
+    // Reset the bit writer for the following iteration if any.
+    if (config->lz77s_types_to_try_size_ > 1) VP8LBitWriterReset(&bw_init, bw);
+    WebPSafeFree(tokens);
+    tokens = NULL;
+    if (huffman_codes != NULL) {
+      WebPSafeFree(huffman_codes->codes);
+      WebPSafeFree(huffman_codes);
+      huffman_codes = NULL;
     }
   }
-
-  *hdr_size = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
-  // Store actual literals.
-  err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
-                            histogram_symbols, huffman_codes);
-  *data_size =
-        (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
+  VP8LBitWriterSwap(bw, &bw_best);
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
-  VP8LFreeHistogramSet(tmp_histos);
-  VP8LBackwardRefsClear(&refs);
+  VP8LFreeHistogram(tmp_histo);
   if (huffman_codes != NULL) {
     WebPSafeFree(huffman_codes->codes);
     WebPSafeFree(huffman_codes);
   }
   WebPSafeFree(histogram_symbols);
+  VP8LBitWriterWipeOut(&bw_best);
   return err;
 }
 
@@ -1005,11 +1095,11 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
   VP8LPutBits(bw, pred_bits - 2, 3);
-  return EncodeImageNoHuffman(bw, enc->transform_data_,
-                              (VP8LHashChain*)&enc->hash_chain_,
-                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                              transform_width, transform_height,
-                              quality, low_effort);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
+      quality, low_effort);
 }
 
 static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
@@ -1026,11 +1116,11 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
   VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
   VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
-  return EncodeImageNoHuffman(bw, enc->transform_data_,
-                              (VP8LHashChain*)&enc->hash_chain_,
-                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                              transform_width, transform_height,
-                              quality, low_effort);
+  return EncodeImageNoHuffman(
+      bw, enc->transform_data_, (VP8LHashChain*)&enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1], transform_width, transform_height,
+      quality, low_effort);
 }
 
 // -----------------------------------------------------------------------------
@@ -1144,6 +1234,7 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
     }
     enc->transform_mem_ = mem;
     enc->transform_mem_size_ = (size_t)mem_size;
+    enc->argb_content_ = kEncoderNone;
   }
   enc->argb_ = mem;
   mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
@@ -1164,11 +1255,13 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
   int y;
   err = AllocateTransformBuffer(enc, width, height);
   if (err != VP8_ENC_OK) return err;
+  if (enc->argb_content_ == kEncoderARGB) return VP8_ENC_OK;
   for (y = 0; y < height; ++y) {
     memcpy(enc->argb_ + y * width,
            picture->argb + y * picture->argb_stride,
            width * sizeof(*enc->argb_));
   }
+  enc->argb_content_ = kEncoderARGB;
   assert(enc->current_width_ == width);
   return VP8_ENC_OK;
 }
@@ -1215,12 +1308,13 @@ static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
 
 static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
   // Forget about alpha.
-  return ((color & 0x00ffffffu) * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS);
+  return ((uint32_t)((color & 0x00ffffffu) * 4222244071ull)) >>
+         (32 - PALETTE_INV_SIZE_BITS);
 }
 
 static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
   // Forget about alpha.
-  return (color & 0x00ffffffu) * ((1u << 31) - 1) >>
+  return ((uint32_t)((color & 0x00ffffffu) * ((1ull << 31) - 1))) >>
          (32 - PALETTE_INV_SIZE_BITS);
 }
 
@@ -1346,6 +1440,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
   err = ApplyPalette(src, src_stride,
                      enc->argb_, enc->current_width_,
                      palette, palette_size, width, height, xbits);
+  enc->argb_content_ = kEncoderPalette;
   return err;
 }
 
@@ -1364,8 +1459,9 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
     tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
   tmp_palette[0] = palette[0];
-  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
-                              palette_size, 1, 20 /* quality */, low_effort);
+  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_,
+                              &enc->refs_[0], &enc->refs_[1], palette_size, 1,
+                              20 /* quality */, low_effort);
 }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
@@ -1400,10 +1496,11 @@ static WebPEncodingError EncodeDeltaPalettePredictorImage(
   VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
   VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   VP8LPutBits(bw, pred_bits - 2, 3);
-  err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
-                             (VP8LBackwardRefs*)enc->refs_,  // cast const away
-                             transform_width, transform_height,
-                             quality, low_effort);
+  err = EncodeImageNoHuffman(
+      bw, predictors, &enc->hash_chain_,
+      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
+      (VP8LBackwardRefs*)&enc->refs_[1],
+      transform_width, transform_height, quality, low_effort);
   WebPSafeFree(predictors);
   return err;
 }
@@ -1422,6 +1519,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   }
   enc->config_ = config;
   enc->pic_ = picture;
+  enc->argb_content_ = kEncoderNone;
 
   VP8LEncDspInit();
 
@@ -1430,9 +1528,9 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
 
 static void VP8LEncoderDelete(VP8LEncoder* enc) {
   if (enc != NULL) {
+    int i;
     VP8LHashChainClear(&enc->hash_chain_);
-    VP8LBackwardRefsClear(&enc->refs_[0]);
-    VP8LBackwardRefsClear(&enc->refs_[1]);
+    for (i = 0; i < 3; ++i) VP8LBackwardRefsClear(&enc->refs_[i]);
     ClearTransformBuffer(enc);
     WebPSafeFree(enc);
   }
@@ -1441,134 +1539,347 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) {
 // -----------------------------------------------------------------------------
 // Main call
 
-WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
-                                   const WebPPicture* const picture,
-                                   VP8LBitWriter* const bw, int use_cache) {
+typedef struct {
+  const WebPConfig* config_;
+  const WebPPicture* picture_;
+  VP8LBitWriter* bw_;
+  VP8LEncoder* enc_;
+  int use_cache_;
+  CrunchConfig crunch_configs_[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_;
+  int red_and_blue_always_zero_;
+  WebPEncodingError err_;
+  WebPAuxStats* stats_;
+} StreamEncodeContext;
+
+static int EncodeStreamHook(void* input, void* data2) {
+  StreamEncodeContext* const params = (StreamEncodeContext*)input;
+  const WebPConfig* const config = params->config_;
+  const WebPPicture* const picture = params->picture_;
+  VP8LBitWriter* const bw = params->bw_;
+  VP8LEncoder* const enc = params->enc_;
+  const int use_cache = params->use_cache_;
+  const CrunchConfig* const crunch_configs = params->crunch_configs_;
+  const int num_crunch_configs = params->num_crunch_configs_;
+  const int red_and_blue_always_zero = params->red_and_blue_always_zero_;
+#if !defined(WEBP_DISABLE_STATS)
+  WebPAuxStats* const stats = params->stats_;
+#endif
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
+#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
   const int width = picture->width;
+#endif
   const int height = picture->height;
-  VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+#if (WEBP_NEAR_LOSSLESS == 1)
   int use_near_lossless = 0;
+#endif
   int hdr_size = 0;
   int data_size = 0;
   int use_delta_palette = 0;
+  int idx;
+  size_t best_size = 0;
+  VP8LBitWriter bw_init = *bw, bw_best;
+  (void)data2;
 
-  if (enc == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-
-  // ---------------------------------------------------------------------------
-  // Analyze image (entropy, num_palettes etc)
-
-  if (!AnalyzeAndInit(enc)) {
+  if (!VP8LBitWriterInit(&bw_best, 0) ||
+      (num_crunch_configs > 1 && !VP8LBitWriterClone(bw, &bw_best))) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
-  // Apply near-lossless preprocessing.
-  use_near_lossless =
-      (config->near_lossless < 100) && !enc->use_palette_ && !enc->use_predict_;
-  if (use_near_lossless) {
-    if (!VP8ApplyNearLossless(width, height, picture->argb,
-                              config->near_lossless)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+  for (idx = 0; idx < num_crunch_configs; ++idx) {
+    const int entropy_idx = crunch_configs[idx].entropy_idx_;
+    enc->use_palette_ = (entropy_idx == kPalette);
+    enc->use_subtract_green_ =
+        (entropy_idx == kSubGreen) || (entropy_idx == kSpatialSubGreen);
+    enc->use_predict_ =
+        (entropy_idx == kSpatial) || (entropy_idx == kSpatialSubGreen);
+    if (low_effort) {
+      enc->use_cross_color_ = 0;
+    } else {
+      enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
     }
-  }
+    // Reset any parameter in the encoder that is set in the previous iteration.
+    enc->cache_bits_ = 0;
+    VP8LBackwardRefsClear(&enc->refs_[0]);
+    VP8LBackwardRefsClear(&enc->refs_[1]);
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->use_delta_palette) {
-    enc->use_predict_ = 1;
-    enc->use_cross_color_ = 0;
-    enc->use_subtract_green_ = 0;
-    enc->use_palette_ = 1;
-    err = MakeInputImageCopy(enc);
-    if (err != VP8_ENC_OK) goto Error;
-    err = WebPSearchOptimalDeltaPalette(enc);
-    if (err != VP8_ENC_OK) goto Error;
-    if (enc->use_palette_) {
+#if (WEBP_NEAR_LOSSLESS == 1)
+    // Apply near-lossless preprocessing.
+    use_near_lossless = (config->near_lossless < 100) && !enc->use_palette_ &&
+                        !enc->use_predict_;
+    if (use_near_lossless) {
       err = AllocateTransformBuffer(enc, width, height);
       if (err != VP8_ENC_OK) goto Error;
-      err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
+      if ((enc->argb_content_ != kEncoderNearLossless) &&
+          !VP8ApplyNearLossless(picture, config->near_lossless, enc->argb_)) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
+      enc->argb_content_ = kEncoderNearLossless;
+    } else {
+      enc->argb_content_ = kEncoderNone;
+    }
+#else
+    enc->argb_content_ = kEncoderNone;
+#endif
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+    if (config->use_delta_palette) {
+      enc->use_predict_ = 1;
+      enc->use_cross_color_ = 0;
+      enc->use_subtract_green_ = 0;
+      enc->use_palette_ = 1;
+      if (enc->argb_content_ != kEncoderNearLossless &&
+          enc->argb_content_ != kEncoderPalette) {
+        err = MakeInputImageCopy(enc);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+      err = WebPSearchOptimalDeltaPalette(enc);
       if (err != VP8_ENC_OK) goto Error;
-      use_delta_palette = 1;
+      if (enc->use_palette_) {
+        err = AllocateTransformBuffer(enc, width, height);
+        if (err != VP8_ENC_OK) goto Error;
+        err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
+        if (err != VP8_ENC_OK) goto Error;
+        use_delta_palette = 1;
+      }
     }
-  }
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
-  // Encode palette
-  if (enc->use_palette_) {
-    err = EncodePalette(bw, low_effort, enc);
-    if (err != VP8_ENC_OK) goto Error;
-    err = MapImageFromPalette(enc, use_delta_palette);
-    if (err != VP8_ENC_OK) goto Error;
-    // If using a color cache, do not have it bigger than the number of colors.
-    if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
-      enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
-    }
-  }
-  if (!use_delta_palette) {
-    // In case image is not packed.
-    if (enc->argb_ == NULL) {
-      err = MakeInputImageCopy(enc);
+    // Encode palette
+    if (enc->use_palette_) {
+      err = EncodePalette(bw, low_effort, enc);
+      if (err != VP8_ENC_OK) goto Error;
+      err = MapImageFromPalette(enc, use_delta_palette);
       if (err != VP8_ENC_OK) goto Error;
+      // If using a color cache, do not have it bigger than the number of
+      // colors.
+      if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+        enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+      }
     }
+    if (!use_delta_palette) {
+      // In case image is not packed.
+      if (enc->argb_content_ != kEncoderNearLossless &&
+          enc->argb_content_ != kEncoderPalette) {
+        err = MakeInputImageCopy(enc);
+        if (err != VP8_ENC_OK) goto Error;
+      }
 
-    // -------------------------------------------------------------------------
-    // Apply transforms and write transform data.
+      // -----------------------------------------------------------------------
+      // Apply transforms and write transform data.
 
-    if (enc->use_subtract_green_) {
-      ApplySubtractGreen(enc, enc->current_width_, height, bw);
-    }
+      if (enc->use_subtract_green_) {
+        ApplySubtractGreen(enc, enc->current_width_, height, bw);
+      }
 
-    if (enc->use_predict_) {
-      err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
-                               low_effort, enc->use_subtract_green_, bw);
-      if (err != VP8_ENC_OK) goto Error;
+      if (enc->use_predict_) {
+        err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                                 low_effort, enc->use_subtract_green_, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
+
+      if (enc->use_cross_color_) {
+        err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality,
+                                    low_effort, bw);
+        if (err != VP8_ENC_OK) goto Error;
+      }
     }
 
-    if (enc->use_cross_color_) {
-      err = ApplyCrossColorFilter(enc, enc->current_width_,
-                                  height, quality, low_effort, bw);
-      if (err != VP8_ENC_OK) goto Error;
+    VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+
+    // -------------------------------------------------------------------------
+    // Encode and write the transformed image.
+    err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+                              enc->current_width_, height, quality, low_effort,
+                              use_cache, &crunch_configs[idx],
+                              &enc->cache_bits_, enc->histo_bits_,
+                              byte_position, &hdr_size, &data_size);
+    if (err != VP8_ENC_OK) goto Error;
+
+    // If we are better than what we already have.
+    if (idx == 0 || VP8LBitWriterNumBytes(bw) < best_size) {
+      best_size = VP8LBitWriterNumBytes(bw);
+      // Store the BitWriter.
+      VP8LBitWriterSwap(bw, &bw_best);
+#if !defined(WEBP_DISABLE_STATS)
+      // Update the stats.
+      if (stats != NULL) {
+        stats->lossless_features = 0;
+        if (enc->use_predict_) stats->lossless_features |= 1;
+        if (enc->use_cross_color_) stats->lossless_features |= 2;
+        if (enc->use_subtract_green_) stats->lossless_features |= 4;
+        if (enc->use_palette_) stats->lossless_features |= 8;
+        stats->histogram_bits = enc->histo_bits_;
+        stats->transform_bits = enc->transform_bits_;
+        stats->cache_bits = enc->cache_bits_;
+        stats->palette_size = enc->palette_size_;
+        stats->lossless_size = (int)(best_size - byte_position);
+        stats->lossless_hdr_size = hdr_size;
+        stats->lossless_data_size = data_size;
+      }
+#endif
     }
+    // Reset the bit writer for the following iteration if any.
+    if (num_crunch_configs > 1) VP8LBitWriterReset(&bw_init, bw);
   }
+  VP8LBitWriterSwap(&bw_best, bw);
 
-  VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+Error:
+  VP8LBitWriterWipeOut(&bw_best);
+  params->err_ = err;
+  // The hook should return false in case of error.
+  return (err == VP8_ENC_OK);
+}
 
-  // ---------------------------------------------------------------------------
-  // Encode and write the transformed image.
-  err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                            enc->current_width_, height, quality, low_effort,
-                            use_cache, &enc->cache_bits_, enc->histo_bits_,
-                            byte_position, &hdr_size, &data_size);
-  if (err != VP8_ENC_OK) goto Error;
+WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
+                                   const WebPPicture* const picture,
+                                   VP8LBitWriter* const bw_main,
+                                   int use_cache) {
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LEncoder* const enc_main = VP8LEncoderNew(config, picture);
+  VP8LEncoder* enc_side = NULL;
+  CrunchConfig crunch_configs[CRUNCH_CONFIGS_MAX];
+  int num_crunch_configs_main, num_crunch_configs_side = 0;
+  int idx;
+  int red_and_blue_always_zero = 0;
+  WebPWorker worker_main, worker_side;
+  StreamEncodeContext params_main, params_side;
+  // The main thread uses picture->stats, the side thread uses stats_side.
+  WebPAuxStats stats_side;
+  VP8LBitWriter bw_side;
+  const WebPWorkerInterface* const worker_interface = WebPGetWorkerInterface();
+  int ok_main;
 
-  if (picture->stats != NULL) {
-    WebPAuxStats* const stats = picture->stats;
-    stats->lossless_features = 0;
-    if (enc->use_predict_) stats->lossless_features |= 1;
-    if (enc->use_cross_color_) stats->lossless_features |= 2;
-    if (enc->use_subtract_green_) stats->lossless_features |= 4;
-    if (enc->use_palette_) stats->lossless_features |= 8;
-    stats->histogram_bits = enc->histo_bits_;
-    stats->transform_bits = enc->transform_bits_;
-    stats->cache_bits = enc->cache_bits_;
-    stats->palette_size = enc->palette_size_;
-    stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
-    stats->lossless_hdr_size = hdr_size;
-    stats->lossless_data_size = data_size;
+  // Analyze image (entropy, num_palettes etc)
+  if (enc_main == NULL ||
+      !EncoderAnalyze(enc_main, crunch_configs, &num_crunch_configs_main,
+                      &red_and_blue_always_zero) ||
+      !EncoderInit(enc_main) || !VP8LBitWriterInit(&bw_side, 0)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
   }
 
- Error:
-  VP8LEncoderDelete(enc);
+  // Split the configs between the main and side threads (if any).
+  if (config->thread_level > 0) {
+    num_crunch_configs_side = num_crunch_configs_main / 2;
+    for (idx = 0; idx < num_crunch_configs_side; ++idx) {
+      params_side.crunch_configs_[idx] =
+          crunch_configs[num_crunch_configs_main - num_crunch_configs_side +
+                         idx];
+    }
+    params_side.num_crunch_configs_ = num_crunch_configs_side;
+  }
+  num_crunch_configs_main -= num_crunch_configs_side;
+  for (idx = 0; idx < num_crunch_configs_main; ++idx) {
+    params_main.crunch_configs_[idx] = crunch_configs[idx];
+  }
+  params_main.num_crunch_configs_ = num_crunch_configs_main;
+
+  // Fill in the parameters for the thread workers.
+  {
+    const int params_size = (num_crunch_configs_side > 0) ? 2 : 1;
+    for (idx = 0; idx < params_size; ++idx) {
+      // Create the parameters for each worker.
+      WebPWorker* const worker = (idx == 0) ? &worker_main : &worker_side;
+      StreamEncodeContext* const param =
+          (idx == 0) ? &params_main : &params_side;
+      param->config_ = config;
+      param->picture_ = picture;
+      param->use_cache_ = use_cache;
+      param->red_and_blue_always_zero_ = red_and_blue_always_zero;
+      if (idx == 0) {
+        param->stats_ = picture->stats;
+        param->bw_ = bw_main;
+        param->enc_ = enc_main;
+      } else {
+        param->stats_ = (picture->stats == NULL) ? NULL : &stats_side;
+        // Create a side bit writer.
+        if (!VP8LBitWriterClone(bw_main, &bw_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        param->bw_ = &bw_side;
+        // Create a side encoder.
+        enc_side = VP8LEncoderNew(config, picture);
+        if (enc_side == NULL || !EncoderInit(enc_side)) {
+          err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+          goto Error;
+        }
+        // Copy the values that were computed for the main encoder.
+        enc_side->histo_bits_ = enc_main->histo_bits_;
+        enc_side->transform_bits_ = enc_main->transform_bits_;
+        enc_side->palette_size_ = enc_main->palette_size_;
+        memcpy(enc_side->palette_, enc_main->palette_,
+               sizeof(enc_main->palette_));
+        param->enc_ = enc_side;
+      }
+      // Create the workers.
+      worker_interface->Init(worker);
+      worker->data1 = param;
+      worker->data2 = NULL;
+      worker->hook = (WebPWorkerHook)EncodeStreamHook;
+    }
+  }
+
+  // Start the second thread if needed.
+  if (num_crunch_configs_side != 0) {
+    if (!worker_interface->Reset(&worker_side)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+#if !defined(WEBP_DISABLE_STATS)
+    // This line is here and not in the param initialization above to remove a
+    // Clang static analyzer warning.
+    if (picture->stats != NULL) {
+      memcpy(&stats_side, picture->stats, sizeof(stats_side));
+    }
+#endif
+    // This line is only useful to remove a Clang static analyzer warning.
+    params_side.err_ = VP8_ENC_OK;
+    worker_interface->Launch(&worker_side);
+  }
+  // Execute the main thread.
+  worker_interface->Execute(&worker_main);
+  ok_main = worker_interface->Sync(&worker_main);
+  worker_interface->End(&worker_main);
+  if (num_crunch_configs_side != 0) {
+    // Wait for the second thread.
+    const int ok_side = worker_interface->Sync(&worker_side);
+    worker_interface->End(&worker_side);
+    if (!ok_main || !ok_side) {
+      err = ok_main ? params_side.err_ : params_main.err_;
+      goto Error;
+    }
+    if (VP8LBitWriterNumBytes(&bw_side) < VP8LBitWriterNumBytes(bw_main)) {
+      VP8LBitWriterSwap(bw_main, &bw_side);
+#if !defined(WEBP_DISABLE_STATS)
+      if (picture->stats != NULL) {
+        memcpy(picture->stats, &stats_side, sizeof(*picture->stats));
+      }
+#endif
+    }
+  } else {
+    if (!ok_main) {
+      err = params_main.err_;
+      goto Error;
+    }
+  }
+
+Error:
+  VP8LBitWriterWipeOut(&bw_side);
+  VP8LEncoderDelete(enc_main);
+  VP8LEncoderDelete(enc_side);
   return err;
 }
 
+#undef CRUNCH_CONFIGS_MAX
+#undef CRUNCH_CONFIGS_LZ77_MAX
+
 int VP8LEncodeImage(const WebPConfig* const config,
                     const WebPPicture* const picture) {
   int width, height;
@@ -1642,11 +1953,13 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   if (!WebPReportProgress(picture, 100, &percent)) goto UserAbort;
 
+#if !defined(WEBP_DISABLE_STATS)
   // Save size.
   if (picture->stats != NULL) {
     picture->stats->coded_size += (int)coded_size;
     picture->stats->lossless_size = (int)coded_size;
   }
+#endif
 
   if (picture->extra_info != NULL) {
     const int mb_w = (width + 15) >> 4;
diff --git a/thirdparty/libwebp/enc/vp8li_enc.h b/thirdparty/libwebp/src/enc/vp8li_enc.h
index 8c5fbcbb2e..298a4a0014 100644
--- a/thirdparty/libwebp/enc/vp8li_enc.h
+++ b/thirdparty/libwebp/src/enc/vp8li_enc.h
@@ -11,14 +11,23 @@
 //
 // Author: Vikas Arora (vikaas.arora@gmail.com)
 
-#ifndef WEBP_ENC_VP8LI_H_
-#define WEBP_ENC_VP8LI_H_
+#ifndef WEBP_ENC_VP8LI_ENC_H_
+#define WEBP_ENC_VP8LI_ENC_H_
 
-#include "./backward_references_enc.h"
-#include "./histogram_enc.h"
-#include "../utils/bit_writer_utils.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"
+#ifdef HAVE_CONFIG_H
+#include "src/webp/config.h"
+#endif
+// Either WEBP_NEAR_LOSSLESS is defined as 0 in config.h when compiling to
+// disable near-lossless, or it is enabled by default.
+#ifndef WEBP_NEAR_LOSSLESS
+#define WEBP_NEAR_LOSSLESS 1
+#endif
+
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,16 +36,24 @@ extern "C" {
 // maximum value of transform_bits_ in VP8LEncoder.
 #define MAX_TRANSFORM_BITS 6
 
+typedef enum {
+  kEncoderNone = 0,
+  kEncoderARGB,
+  kEncoderNearLossless,
+  kEncoderPalette
+} VP8LEncoderARGBContent;
+
 typedef struct {
   const WebPConfig* config_;      // user configuration and parameters
   const WebPPicture* pic_;        // input picture.
 
-  uint32_t* argb_;                // Transformed argb image data.
-  uint32_t* argb_scratch_;        // Scratch memory for argb rows
-                                  // (used for prediction).
-  uint32_t* transform_data_;      // Scratch memory for transform data.
-  uint32_t* transform_mem_;       // Currently allocated memory.
-  size_t    transform_mem_size_;  // Currently allocated memory size.
+  uint32_t* argb_;                       // Transformed argb image data.
+  VP8LEncoderARGBContent argb_content_;  // Content type of the argb buffer.
+  uint32_t* argb_scratch_;               // Scratch memory for argb rows
+                                         // (used for prediction).
+  uint32_t* transform_data_;             // Scratch memory for transform data.
+  uint32_t* transform_mem_;              // Currently allocated memory.
+  size_t    transform_mem_size_;         // Currently allocated memory size.
 
   int       current_width_;       // Corresponds to packed image width.
 
@@ -54,8 +71,7 @@ typedef struct {
   uint32_t palette_[MAX_PALETTE_SIZE];
 
   // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[2];  // Backward Refs array corresponding to
-                                     // LZ77 & RLE coding.
+  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
   VP8LHashChain hash_chain_;         // HashChain data for constructing
                                      // backward references.
 } VP8LEncoder;
@@ -75,6 +91,13 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    const WebPPicture* const picture,
                                    VP8LBitWriter* const bw, int use_cache);
 
+#if (WEBP_NEAR_LOSSLESS == 1)
+// in near_lossless.c
+// Near lossless preprocessing in RGB color-space.
+int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
+                         uint32_t* const argb_dst);
+#endif
+
 //------------------------------------------------------------------------------
 // Image transforms in predictor.c.
 
@@ -92,4 +115,4 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_ENC_VP8LI_H_ */
+#endif  /* WEBP_ENC_VP8LI_ENC_H_ */
diff --git a/thirdparty/libwebp/enc/webp_enc.c b/thirdparty/libwebp/src/enc/webp_enc.c
index f18461ef92..283cda8e7b 100644
--- a/thirdparty/libwebp/enc/webp_enc.c
+++ b/thirdparty/libwebp/src/enc/webp_enc.c
@@ -16,10 +16,10 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost_enc.h"
-#include "./vp8i_enc.h"
-#include "./vp8li_enc.h"
-#include "../utils/utils.h"
+#include "src/enc/cost_enc.h"
+#include "src/enc/vp8i_enc.h"
+#include "src/enc/vp8li_enc.h"
+#include "src/utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
 
@@ -207,7 +207,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->preds_w_ = preds_w;
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
-  enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
+  enc->preds_ = mem + 1 + enc->preds_w_;
   mem += preds_size;
   enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
   mem += nz_size;
@@ -216,7 +216,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
 
   // top samples (all 16-aligned)
   mem = (uint8_t*)WEBP_ALIGN(mem);
-  enc->y_top_ = (uint8_t*)mem;
+  enc->y_top_ = mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
   assert(mem <= (uint8_t*)enc + size);
@@ -256,6 +256,7 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
 
 //------------------------------------------------------------------------------
 
+#if !defined(WEBP_DISABLE_STATS)
 static double GetPSNR(uint64_t err, uint64_t size) {
   return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
 }
@@ -270,8 +271,10 @@ static void FinalizePSNR(const VP8Encoder* const enc) {
   stats->PSNR[3] = (float)GetPSNR(sse[0] + sse[1] + sse[2], size * 3 / 2);
   stats->PSNR[4] = (float)GetPSNR(sse[3], size);
 }
+#endif  // !defined(WEBP_DISABLE_STATS)
 
 static void StoreStats(VP8Encoder* const enc) {
+#if !defined(WEBP_DISABLE_STATS)
   WebPAuxStats* const stats = enc->pic_->stats;
   if (stats != NULL) {
     int i, s;
@@ -288,7 +291,9 @@ static void StoreStats(VP8Encoder* const enc) {
       stats->block_count[i] = enc->block_count_[i];
     }
   }
+#else  // defined(WEBP_DISABLE_STATS)
   WebPReportProgress(enc->pic_, 100, &enc->percent_);  // done!
+#endif  // !defined(WEBP_DISABLE_STATS)
 }
 
 int WebPEncodingSetError(const WebPPicture* const pic,
@@ -336,10 +341,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
 
-    if (!config->exact) {
-      WebPCleanupTransparentArea(pic);
-    }
-
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
       if (config->use_sharp_yuv || (config->preprocessing & 4)) {
@@ -361,6 +362,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
       }
     }
 
+    if (!config->exact) {
+      WebPCleanupTransparentArea(pic);
+    }
+
     enc = InitVP8Encoder(config, pic);
     if (enc == NULL) return 0;  // pic->error is already set.
     // Note: each of the tasks below account for 20% in the progress report.
diff --git a/thirdparty/libwebp/mux/anim_encode.c b/thirdparty/libwebp/src/mux/anim_encode.c
index 6066388727..7be99068f6 100644
--- a/thirdparty/libwebp/mux/anim_encode.c
+++ b/thirdparty/libwebp/src/mux/anim_encode.c
@@ -16,12 +16,12 @@
 #include <stdio.h>
 #include <stdlib.h>  // for abs()
 
-#include "../mux/animi.h"
-#include "../utils/utils.h"
-#include "../webp/decode.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"
-#include "../webp/mux.h"
+#include "src/mux/animi.h"
+#include "src/utils/utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/mux.h"
 
 #if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf
@@ -35,7 +35,7 @@
 // Stores frame rectangle dimensions.
 typedef struct {
   int x_offset_, y_offset_, width_, height_;
-} FrameRect;
+} FrameRectangle;
 
 // Used to store two candidates of encoded data for an animation frame. One of
 // the two will be chosen later.
@@ -50,7 +50,7 @@ struct WebPAnimEncoder {
   const int canvas_height_;                 // Canvas height.
   const WebPAnimEncoderOptions options_;    // Global encoding options.
 
-  FrameRect prev_rect_;               // Previous WebP frame rectangle.
+  FrameRectangle prev_rect_;          // Previous WebP frame rectangle.
   WebPConfig last_config_;            // Cached in case a re-encode is needed.
   WebPConfig last_config_reversed_;   // If 'last_config_' uses lossless, then
                                       // this config uses lossy and vice versa;
@@ -206,7 +206,7 @@ static void ClearRectangle(WebPPicture* const picture,
 }
 
 static void WebPUtilClearPic(WebPPicture* const picture,
-                             const FrameRect* const rect) {
+                             const FrameRectangle* const rect) {
   if (rect != NULL) {
     ClearRectangle(picture, rect->x_offset_, rect->y_offset_,
                    rect->width_, rect->height_);
@@ -400,7 +400,7 @@ static WEBP_INLINE int ComparePixelsLossy(const uint32_t* src, int src_step,
   return 1;
 }
 
-static int IsEmptyRect(const FrameRect* const rect) {
+static int IsEmptyRect(const FrameRectangle* const rect) {
   return (rect->width_ == 0) || (rect->height_ == 0);
 }
 
@@ -413,7 +413,7 @@ static int QualityToMaxDiff(float quality) {
 // Assumes that an initial valid guess of change rectangle 'rect' is passed.
 static void MinimizeChangeRectangle(const WebPPicture* const src,
                                     const WebPPicture* const dst,
-                                    FrameRect* const rect,
+                                    FrameRectangle* const rect,
                                     int is_lossless, float quality) {
   int i, j;
   const ComparePixelsFunc compare_pixels =
@@ -498,7 +498,7 @@ static void MinimizeChangeRectangle(const WebPPicture* const src,
 }
 
 // Snap rectangle to even offsets (and adjust dimensions if needed).
-static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
+static WEBP_INLINE void SnapToEvenOffsets(FrameRectangle* const rect) {
   rect->width_ += (rect->x_offset_ & 1);
   rect->height_ += (rect->y_offset_ & 1);
   rect->x_offset_ &= ~1;
@@ -508,9 +508,9 @@ static WEBP_INLINE void SnapToEvenOffsets(FrameRect* const rect) {
 typedef struct {
   int should_try_;               // Should try this set of parameters.
   int empty_rect_allowed_;       // Frame with empty rectangle can be skipped.
-  FrameRect rect_ll_;            // Frame rectangle for lossless compression.
+  FrameRectangle rect_ll_;       // Frame rectangle for lossless compression.
   WebPPicture sub_frame_ll_;     // Sub-frame pic for lossless compression.
-  FrameRect rect_lossy_;         // Frame rectangle for lossy compression.
+  FrameRectangle rect_lossy_;    // Frame rectangle for lossy compression.
                                  // Could be smaller than rect_ll_ as pixels
                                  // with small diffs can be ignored.
   WebPPicture sub_frame_lossy_;  // Sub-frame pic for lossless compression.
@@ -538,7 +538,8 @@ static void SubFrameParamsFree(SubFrameParams* const params) {
 static int GetSubRect(const WebPPicture* const prev_canvas,
                       const WebPPicture* const curr_canvas, int is_key_frame,
                       int is_first_frame, int empty_rect_allowed,
-                      int is_lossless, float quality, FrameRect* const rect,
+                      int is_lossless, float quality,
+                      FrameRectangle* const rect,
                       WebPPicture* const sub_frame) {
   if (!is_key_frame || is_first_frame) {  // Optimize frame rectangle.
     // Note: This behaves as expected for first frame, as 'prev_canvas' is
@@ -594,7 +595,7 @@ int WebPAnimEncoderRefineRect(
     const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
     int is_lossless, float quality, int* const x_offset, int* const y_offset,
     int* const width, int* const height) {
-  FrameRect rect;
+  FrameRectangle rect;
   const int right = clip(*x_offset + *width, 0, curr_canvas->width);
   const int left = clip(*x_offset, 0, curr_canvas->width - 1);
   const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
@@ -620,7 +621,7 @@ int WebPAnimEncoderRefineRect(
 }
 
 static void DisposeFrameRectangle(int dispose_method,
-                                  const FrameRect* const rect,
+                                  const FrameRectangle* const rect,
                                   WebPPicture* const curr_canvas) {
   assert(rect != NULL);
   if (dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
@@ -628,13 +629,13 @@ static void DisposeFrameRectangle(int dispose_method,
   }
 }
 
-static uint32_t RectArea(const FrameRect* const rect) {
+static uint32_t RectArea(const FrameRectangle* const rect) {
   return (uint32_t)rect->width_ * rect->height_;
 }
 
 static int IsLosslessBlendingPossible(const WebPPicture* const src,
                                       const WebPPicture* const dst,
-                                      const FrameRect* const rect) {
+                                      const FrameRectangle* const rect) {
   int i, j;
   assert(src->width == dst->width && src->height == dst->height);
   assert(rect->x_offset_ + rect->width_ <= dst->width);
@@ -656,7 +657,7 @@ static int IsLosslessBlendingPossible(const WebPPicture* const src,
 
 static int IsLossyBlendingPossible(const WebPPicture* const src,
                                    const WebPPicture* const dst,
-                                   const FrameRect* const rect,
+                                   const FrameRectangle* const rect,
                                    float quality) {
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   int i, j;
@@ -683,7 +684,7 @@ static int IsLossyBlendingPossible(const WebPPicture* const src,
 // transparent pixels.
 // Returns true if at least one pixel gets modified.
 static int IncreaseTransparency(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                 WebPPicture* const dst) {
   int i, j;
   int modified = 0;
@@ -709,7 +710,7 @@ static int IncreaseTransparency(const WebPPicture* const src,
 // Assumes lossy compression is being used.
 // Returns true if at least one pixel gets modified.
 static int FlattenSimilarBlocks(const WebPPicture* const src,
-                                const FrameRect* const rect,
+                                const FrameRectangle* const rect,
                                 WebPPicture* const dst, float quality) {
   const int max_allowed_diff_lossy = QualityToMaxDiff(quality);
   int i, j;
@@ -778,13 +779,13 @@ static int EncodeFrame(const WebPConfig* const config, WebPPicture* const pic,
 typedef struct {
   WebPMemoryWriter  mem_;
   WebPMuxFrameInfo  info_;
-  FrameRect         rect_;
+  FrameRectangle    rect_;
   int               evaluate_;  // True if this candidate should be evaluated.
 } Candidate;
 
 // Generates a candidate encoded frame given a picture and metadata.
 static WebPEncodingError EncodeCandidate(WebPPicture* const sub_frame,
-                                         const FrameRect* const rect,
+                                         const FrameRectangle* const rect,
                                          const WebPConfig* const encoder_config,
                                          int use_blending,
                                          Candidate* const candidate) {
@@ -958,7 +959,7 @@ static int IncreasePreviousDuration(WebPAnimEncoder* const enc, int duration) {
   if (new_duration >= MAX_DURATION) {  // Special case.
     // Separate out previous frame from earlier merged frames to avoid overflow.
     // We add a 1x1 transparent frame for the previous frame, with blending on.
-    const FrameRect rect = { 0, 0, 1, 1 };
+    const FrameRectangle rect = { 0, 0, 1, 1 };
     const uint8_t lossless_1x1_bytes[] = {
       0x52, 0x49, 0x46, 0x46, 0x14, 0x00, 0x00, 0x00, 0x57, 0x45, 0x42, 0x50,
       0x56, 0x50, 0x38, 0x4c, 0x08, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00,
@@ -1223,7 +1224,7 @@ static int CacheFrame(WebPAnimEncoder* const enc,
       enc->prev_candidate_undecided_ = 0;
     } else {
       int64_t curr_delta;
-      FrameRect prev_rect_key, prev_rect_sub;
+      FrameRectangle prev_rect_key, prev_rect_sub;
 
       // Add this as a frame rectangle to enc.
       error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
@@ -1535,7 +1536,8 @@ int WebPAnimEncoderAssemble(WebPAnimEncoder* enc, WebPData* webp_data) {
 
   if (!enc->got_null_frame_ && enc->in_frame_count_ > 1 && enc->count_ > 0) {
     // set duration of the last frame to be avg of durations of previous frames.
-    const double delta_time = enc->prev_timestamp_ - enc->first_timestamp_;
+    const double delta_time =
+        (uint32_t)enc->prev_timestamp_ - enc->first_timestamp_;
     const int average_duration = (int)(delta_time / (enc->in_frame_count_ - 1));
     if (!IncreasePreviousDuration(enc, average_duration)) {
       return 0;
diff --git a/thirdparty/libwebp/mux/animi.h b/thirdparty/libwebp/src/mux/animi.h
index cecaf1fee5..88899532aa 100644
--- a/thirdparty/libwebp/mux/animi.h
+++ b/thirdparty/libwebp/src/mux/animi.h
@@ -14,7 +14,7 @@
 #ifndef WEBP_MUX_ANIMI_H_
 #define WEBP_MUX_ANIMI_H_
 
-#include "../webp/mux.h"
+#include "src/webp/mux.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/thirdparty/libwebp/mux/muxedit.c b/thirdparty/libwebp/src/mux/muxedit.c
index d2c5305372..7a027b3cb4 100644
--- a/thirdparty/libwebp/mux/muxedit.c
+++ b/thirdparty/libwebp/src/mux/muxedit.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Life of a mux object.
diff --git a/thirdparty/libwebp/mux/muxi.h b/thirdparty/libwebp/src/mux/muxi.h
index e6606aa5d1..b73e3fbd7a 100644
--- a/thirdparty/libwebp/mux/muxi.h
+++ b/thirdparty/libwebp/src/mux/muxi.h
@@ -15,9 +15,9 @@
 #define WEBP_MUX_MUXI_H_
 
 #include <stdlib.h>
-#include "../dec/vp8i_dec.h"
-#include "../dec/vp8li_dec.h"
-#include "../webp/mux.h"
+#include "src/dec/vp8i_dec.h"
+#include "src/dec/vp8li_dec.h"
+#include "src/webp/mux.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,7 +28,7 @@ extern "C" {
 
 #define MUX_MAJ_VERSION 0
 #define MUX_MIN_VERSION 4
-#define MUX_REV_VERSION 0
+#define MUX_REV_VERSION 1
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/thirdparty/libwebp/mux/muxinternal.c b/thirdparty/libwebp/src/mux/muxinternal.c
index 387b57e8fe..1473f100e5 100644
--- a/thirdparty/libwebp/mux/muxinternal.c
+++ b/thirdparty/libwebp/src/mux/muxinternal.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 #define UNDEFINED_CHUNK_SIZE ((uint32_t)(-1))
 
@@ -504,6 +504,20 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
     if (!has_animation && (num_anim == 1 || num_frames > 0)) {
       return WEBP_MUX_INVALID_ARGUMENT;
     }
+    if (!has_animation) {
+      const WebPMuxImage* images = mux->images_;
+      // There can be only one image.
+      if (images == NULL || images->next_ != NULL) {
+        return WEBP_MUX_INVALID_ARGUMENT;
+      }
+      // Size must match.
+      if (mux->canvas_width_ > 0) {
+        if (images->width_ != mux->canvas_width_ ||
+            images->height_ != mux->canvas_height_) {
+          return WEBP_MUX_INVALID_ARGUMENT;
+        }
+      }
+    }
   }
 
   // Verify either VP8X chunk is present OR there is only one elem in
@@ -515,6 +529,7 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
   if (num_vp8x == 0 && num_images != 1) return WEBP_MUX_INVALID_ARGUMENT;
 
   // ALPHA_FLAG & alpha chunk(s) are consistent.
+  // Note: ALPHA_FLAG can be set when there is actually no Alpha data present.
   if (MuxHasAlpha(mux->images_)) {
     if (num_vp8x > 0) {
       // VP8X chunk is present, so it should contain ALPHA_FLAG.
@@ -525,8 +540,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
       if (err != WEBP_MUX_OK) return err;
       if (num_alpha > 0) return WEBP_MUX_INVALID_ARGUMENT;
     }
-  } else {  // Mux doesn't need alpha. So, ALPHA_FLAG should NOT be present.
-    if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
   }
 
   return WEBP_MUX_OK;
diff --git a/thirdparty/libwebp/mux/muxread.c b/thirdparty/libwebp/src/mux/muxread.c
index 410acd9119..0b55286862 100644
--- a/thirdparty/libwebp/mux/muxread.c
+++ b/thirdparty/libwebp/src/mux/muxread.c
@@ -13,8 +13,8 @@
 //          Vikas (vikasa@google.com)
 
 #include <assert.h>
-#include "./muxi.h"
-#include "../utils/utils.h"
+#include "src/mux/muxi.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // Helper method(s).
@@ -43,7 +43,7 @@ static WebPMuxError MuxGet(const WebPMux* const mux, CHUNK_INDEX idx,
   SWITCH_ID_LIST(IDX_ANIM, mux->anim_);
   SWITCH_ID_LIST(IDX_EXIF, mux->exif_);
   SWITCH_ID_LIST(IDX_XMP, mux->xmp_);
-  SWITCH_ID_LIST(IDX_UNKNOWN, mux->unknown_);
+  assert(idx != IDX_UNKNOWN);
   return WEBP_MUX_NOT_FOUND;
 }
 #undef SWITCH_ID_LIST
@@ -270,6 +270,9 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
     ChunkInit(&chunk);
   }
 
+  // Incomplete image.
+  if (wpi->is_partial_) goto Err;
+
   // Validate mux if complete.
   if (MuxValidate(mux) != WEBP_MUX_OK) goto Err;
 
diff --git a/thirdparty/libwebp/utils/bit_reader_inl_utils.h b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h
index fd7fb0446c..2ccc6ed326 100644
--- a/thirdparty/libwebp/utils/bit_reader_inl_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -13,19 +13,19 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_READER_INL_H_
-#define WEBP_UTILS_BIT_READER_INL_H_
+#ifndef WEBP_UTILS_BIT_READER_INL_UTILS_H_
+#define WEBP_UTILS_BIT_READER_INL_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <string.h>  // for memcpy
 
-#include "../dsp/dsp.h"
-#include "./bit_reader_utils.h"
-#include "./endian_inl_utils.h"
-#include "./utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/bit_reader_utils.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -187,4 +187,4 @@ static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
 }    // extern "C"
 #endif
 
-#endif   // WEBP_UTILS_BIT_READER_INL_H_
+#endif   // WEBP_UTILS_BIT_READER_INL_UTILS_H_
diff --git a/thirdparty/libwebp/utils/bit_reader_utils.c b/thirdparty/libwebp/src/utils/bit_reader_utils.c
index 053b710bb8..5fa3ae7795 100644
--- a/thirdparty/libwebp/utils/bit_reader_utils.c
+++ b/thirdparty/libwebp/src/utils/bit_reader_utils.c
@@ -12,11 +12,11 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "./bit_reader_inl_utils.h"
-#include "../utils/utils.h"
+#include "src/utils/bit_reader_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitReader
diff --git a/thirdparty/libwebp/utils/bit_reader_utils.h b/thirdparty/libwebp/src/utils/bit_reader_utils.h
index ea5c584eb4..04f9804409 100644
--- a/thirdparty/libwebp/utils/bit_reader_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_reader_utils.h
@@ -12,14 +12,14 @@
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora (vikaas.arora@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_READER_H_
-#define WEBP_UTILS_BIT_READER_H_
+#ifndef WEBP_UTILS_BIT_READER_UTILS_H_
+#define WEBP_UTILS_BIT_READER_UTILS_H_
 
 #include <assert.h>
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -165,9 +165,10 @@ static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
 
 // For jumping over a number of bits in the bit stream when accessed with
 // VP8LPrefetchBits and VP8LFillBitWindow.
+// This function does *not* set br->eos_, since it's speed-critical.
+// Use with extreme care!
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
   br->bit_pos_ = val;
-  br->eos_ = VP8LIsEndOfStream(br);
 }
 
 // Advances the read buffer by 4 bytes to make room for reading next 32 bits.
@@ -181,4 +182,4 @@ static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_BIT_READER_H_ */
+#endif  /* WEBP_UTILS_BIT_READER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/bit_writer_utils.c b/thirdparty/libwebp/src/utils/bit_writer_utils.c
index ab0c49dce8..f4f476ce3f 100644
--- a/thirdparty/libwebp/utils/bit_writer_utils.c
+++ b/thirdparty/libwebp/src/utils/bit_writer_utils.c
@@ -16,9 +16,9 @@
 #include <string.h>   // for memcpy()
 #include <stdlib.h>
 
-#include "./bit_writer_utils.h"
-#include "./endian_inl_utils.h"
-#include "./utils.h"
+#include "src/utils/bit_writer_utils.h"
+#include "src/utils/endian_inl_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitWriter
@@ -239,6 +239,18 @@ int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size) {
   return VP8LBitWriterResize(bw, expected_size);
 }
 
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst) {
+  const size_t current_size = src->cur_ - src->buf_;
+  assert(src->cur_ >= src->buf_ && src->cur_ <= src->end_);
+  if (!VP8LBitWriterResize(dst, current_size)) return 0;
+  memcpy(dst->buf_, src->buf_, current_size);
+  dst->bits_ = src->bits_;
+  dst->used_ = src->used_;
+  dst->error_ = src->error_;
+  return 1;
+}
+
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
   if (bw != NULL) {
     WebPSafeFree(bw->buf_);
@@ -246,6 +258,21 @@ void VP8LBitWriterWipeOut(VP8LBitWriter* const bw) {
   }
 }
 
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw) {
+  bw->bits_ = bw_init->bits_;
+  bw->used_ = bw_init->used_;
+  bw->cur_ = bw->buf_ + (bw_init->cur_ - bw_init->buf_);
+  assert(bw->cur_ <= bw->end_);
+  bw->error_ = bw_init->error_;
+}
+
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst) {
+  const VP8LBitWriter tmp = *src;
+  *src = *dst;
+  *dst = tmp;
+}
+
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw) {
   // If needed, make some room by flushing some bits out.
   if (bw->cur_ + VP8L_WRITER_BYTES > bw->end_) {
diff --git a/thirdparty/libwebp/utils/bit_writer_utils.h b/thirdparty/libwebp/src/utils/bit_writer_utils.h
index 9c02bbc06d..2cf5976fe3 100644
--- a/thirdparty/libwebp/utils/bit_writer_utils.h
+++ b/thirdparty/libwebp/src/utils/bit_writer_utils.h
@@ -11,10 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_BIT_WRITER_H_
-#define WEBP_UTILS_BIT_WRITER_H_
+#ifndef WEBP_UTILS_BIT_WRITER_UTILS_H_
+#define WEBP_UTILS_BIT_WRITER_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -100,16 +100,24 @@ typedef struct {
   int error_;
 } VP8LBitWriter;
 
-static WEBP_INLINE size_t VP8LBitWriterNumBytes(VP8LBitWriter* const bw) {
+static WEBP_INLINE size_t VP8LBitWriterNumBytes(const VP8LBitWriter* const bw) {
   return (bw->cur_ - bw->buf_) + ((bw->used_ + 7) >> 3);
 }
 
 // Returns false in case of memory allocation error.
 int VP8LBitWriterInit(VP8LBitWriter* const bw, size_t expected_size);
+// Copies src's bytes and bit state into dst. Returns false on memory error.
+int VP8LBitWriterClone(const VP8LBitWriter* const src,
+                       VP8LBitWriter* const dst);
 // Finalize the bitstream coding. Returns a pointer to the internal buffer.
 uint8_t* VP8LBitWriterFinish(VP8LBitWriter* const bw);
 // Release any pending memory and zeroes the object.
 void VP8LBitWriterWipeOut(VP8LBitWriter* const bw);
+// Resets the cursor, bit state and error flag of bw to those of bw_init.
+void VP8LBitWriterReset(const VP8LBitWriter* const bw_init,
+                        VP8LBitWriter* const bw);
+// Swaps the memory held by two BitWriters.
+void VP8LBitWriterSwap(VP8LBitWriter* const src, VP8LBitWriter* const dst);
 
 // Internal function for VP8LPutBits flushing 32 bits from the written state.
 void VP8LPutBitsFlushBits(VP8LBitWriter* const bw);
@@ -143,4 +151,4 @@ static WEBP_INLINE void VP8LPutBits(VP8LBitWriter* const bw,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_BIT_WRITER_H_ */
+#endif  /* WEBP_UTILS_BIT_WRITER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/color_cache_utils.c b/thirdparty/libwebp/src/utils/color_cache_utils.c
index 0172590c48..b09f538e8b 100644
--- a/thirdparty/libwebp/utils/color_cache_utils.c
+++ b/thirdparty/libwebp/src/utils/color_cache_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./color_cache_utils.h"
-#include "./utils.h"
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8LColorCache.
diff --git a/thirdparty/libwebp/utils/color_cache_utils.h b/thirdparty/libwebp/src/utils/color_cache_utils.h
index c373e6b361..20b7be11c9 100644
--- a/thirdparty/libwebp/utils/color_cache_utils.h
+++ b/thirdparty/libwebp/src/utils/color_cache_utils.h
@@ -12,10 +12,12 @@
 // Authors: Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 
-#ifndef WEBP_UTILS_COLOR_CACHE_H_
-#define WEBP_UTILS_COLOR_CACHE_H_
+#ifndef WEBP_UTILS_COLOR_CACHE_UTILS_H_
+#define WEBP_UTILS_COLOR_CACHE_UTILS_H_
 
-#include "../webp/types.h"
+#include <assert.h>
+
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,7 +32,7 @@ typedef struct {
 
 static const uint64_t kHashMul = 0x1e35a7bdull;
 
-static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+static WEBP_INLINE int VP8LHashPix(uint32_t argb, int shift) {
   return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
 }
 
@@ -48,19 +50,19 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
 
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
   cc->colors_[key] = argb;
 }
 
 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  return HashPix(argb, cc->hash_shift_);
+  return VP8LHashPix(argb, cc->hash_shift_);
 }
 
 // Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  const int key = HashPix(argb, cc->hash_shift_);
+  const int key = VP8LHashPix(argb, cc->hash_shift_);
   return (cc->colors_[key] == argb) ? key : -1;
 }
 
@@ -82,4 +84,4 @@ void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 }
 #endif
 
-#endif  // WEBP_UTILS_COLOR_CACHE_H_
+#endif  // WEBP_UTILS_COLOR_CACHE_UTILS_H_
diff --git a/thirdparty/libwebp/utils/endian_inl_utils.h b/thirdparty/libwebp/src/utils/endian_inl_utils.h
index e11260ff7d..4b2f91dfb8 100644
--- a/thirdparty/libwebp/utils/endian_inl_utils.h
+++ b/thirdparty/libwebp/src/utils/endian_inl_utils.h
@@ -9,15 +9,15 @@
 //
 // Endian related functions.
 
-#ifndef WEBP_UTILS_ENDIAN_INL_H_
-#define WEBP_UTILS_ENDIAN_INL_H_
+#ifndef WEBP_UTILS_ENDIAN_INL_UTILS_H_
+#define WEBP_UTILS_ENDIAN_INL_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../dsp/dsp.h"
-#include "../webp/types.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/types.h"
 
 // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
 #if !defined(WORDS_BIGENDIAN) && \
@@ -97,4 +97,4 @@ static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
 #endif  // HAVE_BUILTIN_BSWAP64
 }
 
-#endif  // WEBP_UTILS_ENDIAN_INL_H_
+#endif  // WEBP_UTILS_ENDIAN_INL_UTILS_H_
diff --git a/thirdparty/libwebp/utils/filters_utils.c b/thirdparty/libwebp/src/utils/filters_utils.c
index 49c1d18a22..bbc2c34d93 100644
--- a/thirdparty/libwebp/utils/filters_utils.c
+++ b/thirdparty/libwebp/src/utils/filters_utils.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./filters_utils.h"
+#include "src/utils/filters_utils.h"
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/thirdparty/libwebp/utils/filters_utils.h b/thirdparty/libwebp/src/utils/filters_utils.h
index 088b132fc5..410f2fcdf2 100644
--- a/thirdparty/libwebp/utils/filters_utils.h
+++ b/thirdparty/libwebp/src/utils/filters_utils.h
@@ -11,11 +11,11 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#ifndef WEBP_UTILS_FILTERS_H_
-#define WEBP_UTILS_FILTERS_H_
+#ifndef WEBP_UTILS_FILTERS_UTILS_H_
+#define WEBP_UTILS_FILTERS_UTILS_H_
 
-#include "../webp/types.h"
-#include "../dsp/dsp.h"
+#include "src/webp/types.h"
+#include "src/dsp/dsp.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,4 +29,4 @@ WEBP_FILTER_TYPE WebPEstimateBestFilter(const uint8_t* data,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_FILTERS_H_ */
+#endif  /* WEBP_UTILS_FILTERS_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/huffman_encode_utils.c b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
index f9504658ea..6f3b1bbe02 100644
--- a/thirdparty/libwebp/utils/huffman_encode_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_encode_utils.h"
-#include "./utils.h"
-#include "../webp/format_constants.h"
+#include "src/utils/huffman_encode_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
 // Util function to optimize the symbol map for RLE coding
diff --git a/thirdparty/libwebp/utils/huffman_encode_utils.h b/thirdparty/libwebp/src/utils/huffman_encode_utils.h
index a157165148..3e6763ce49 100644
--- a/thirdparty/libwebp/utils/huffman_encode_utils.h
+++ b/thirdparty/libwebp/src/utils/huffman_encode_utils.h
@@ -11,10 +11,10 @@
 //
 // Entropy encoding (Huffman) for webp lossless
 
-#ifndef WEBP_UTILS_HUFFMAN_ENCODE_H_
-#define WEBP_UTILS_HUFFMAN_ENCODE_H_
+#ifndef WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
+#define WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -57,4 +57,4 @@ void VP8LCreateHuffmanTree(uint32_t* const histogram, int tree_depth_limit,
 }
 #endif
 
-#endif  // WEBP_UTILS_HUFFMAN_ENCODE_H_
+#endif  // WEBP_UTILS_HUFFMAN_ENCODE_UTILS_H_
diff --git a/thirdparty/libwebp/utils/huffman_utils.c b/thirdparty/libwebp/src/utils/huffman_utils.c
index 008b5d746f..7a69963c3e 100644
--- a/thirdparty/libwebp/utils/huffman_utils.c
+++ b/thirdparty/libwebp/src/utils/huffman_utils.c
@@ -14,9 +14,9 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_utils.h"
-#include "./utils.h"
-#include "../webp/format_constants.h"
+#include "src/utils/huffman_utils.h"
+#include "src/utils/utils.h"
+#include "src/webp/format_constants.h"
 
 // Huffman data read via DecodeImageStream is represented in two (red and green)
 // bytes.
diff --git a/thirdparty/libwebp/utils/huffman_utils.h b/thirdparty/libwebp/src/utils/huffman_utils.h
index c6dd6aaa45..ff7ef17f3b 100644
--- a/thirdparty/libwebp/utils/huffman_utils.h
+++ b/thirdparty/libwebp/src/utils/huffman_utils.h
@@ -11,12 +11,12 @@
 //
 // Author: Urvang Joshi (urvang@google.com)
 
-#ifndef WEBP_UTILS_HUFFMAN_H_
-#define WEBP_UTILS_HUFFMAN_H_
+#ifndef WEBP_UTILS_HUFFMAN_UTILS_H_
+#define WEBP_UTILS_HUFFMAN_UTILS_H_
 
 #include <assert.h>
-#include "../webp/format_constants.h"
-#include "../webp/types.h"
+#include "src/webp/format_constants.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -85,4 +85,4 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 }    // extern "C"
 #endif
 
-#endif  // WEBP_UTILS_HUFFMAN_H_
+#endif  // WEBP_UTILS_HUFFMAN_UTILS_H_
diff --git a/thirdparty/libwebp/utils/quant_levels_dec_utils.c b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
index d4d23d3147..3818a78b93 100644
--- a/thirdparty/libwebp/utils/quant_levels_dec_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -14,11 +14,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./quant_levels_dec_utils.h"
+#include "src/utils/quant_levels_dec_utils.h"
 
 #include <string.h>   // for memset
 
-#include "./utils.h"
+#include "src/utils/utils.h"
 
 // #define USE_DITHERING   // uncomment to enable ordered dithering (not vital)
 
@@ -71,10 +71,11 @@ typedef struct {
 
 //------------------------------------------------------------------------------
 
-#define CLIP_MASK (int)(~0U << (8 + DFIX))
+#define CLIP_8b_MASK (int)(~0U << (8 + DFIX))
 static WEBP_INLINE uint8_t clip_8b(int v) {
-  return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+  return (!(v & CLIP_8b_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
 }
+#undef CLIP_8b_MASK
 
 // vertical accumulation
 static void VFilter(SmoothParams* const p) {
diff --git a/thirdparty/libwebp/utils/quant_levels_dec_utils.h b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.h
index 59a13495d3..f822107a72 100644
--- a/thirdparty/libwebp/utils/quant_levels_dec_utils.h
+++ b/thirdparty/libwebp/src/utils/quant_levels_dec_utils.h
@@ -11,10 +11,10 @@
 //
 // Author:  Vikas Arora (vikasa@google.com)
 
-#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
-#define WEBP_UTILS_QUANT_LEVELS_DEC_H_
+#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_
+#define WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,4 +32,4 @@ int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */
+#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/quant_levels_utils.c b/thirdparty/libwebp/src/utils/quant_levels_utils.c
index 73174e8ab9..d65ad3c29d 100644
--- a/thirdparty/libwebp/utils/quant_levels_utils.c
+++ b/thirdparty/libwebp/src/utils/quant_levels_utils.c
@@ -14,7 +14,7 @@
 
 #include <assert.h>
 
-#include "./quant_levels_utils.h"
+#include "src/utils/quant_levels_utils.h"
 
 #define NUM_SYMBOLS     256
 
diff --git a/thirdparty/libwebp/utils/quant_levels_utils.h b/thirdparty/libwebp/src/utils/quant_levels_utils.h
index 1cb5a32cae..75df2ba6a4 100644
--- a/thirdparty/libwebp/utils/quant_levels_utils.h
+++ b/thirdparty/libwebp/src/utils/quant_levels_utils.h
@@ -11,12 +11,12 @@
 //
 // Author:  Vikas Arora (vikasa@google.com)
 
-#ifndef WEBP_UTILS_QUANT_LEVELS_H_
-#define WEBP_UTILS_QUANT_LEVELS_H_
+#ifndef WEBP_UTILS_QUANT_LEVELS_UTILS_H_
+#define WEBP_UTILS_QUANT_LEVELS_UTILS_H_
 
 #include <stdlib.h>
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,4 +33,4 @@ int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_QUANT_LEVELS_H_ */
+#endif  /* WEBP_UTILS_QUANT_LEVELS_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/random_utils.c b/thirdparty/libwebp/src/utils/random_utils.c
index 9f1e4154a6..7edb3fefbb 100644
--- a/thirdparty/libwebp/utils/random_utils.c
+++ b/thirdparty/libwebp/src/utils/random_utils.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <string.h>
-#include "./random_utils.h"
+#include "src/utils/random_utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/utils/random_utils.h b/thirdparty/libwebp/src/utils/random_utils.h
index c392a615ca..6d36c667e7 100644
--- a/thirdparty/libwebp/utils/random_utils.h
+++ b/thirdparty/libwebp/src/utils/random_utils.h
@@ -11,11 +11,11 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_RANDOM_H_
-#define WEBP_UTILS_RANDOM_H_
+#ifndef WEBP_UTILS_RANDOM_UTILS_H_
+#define WEBP_UTILS_RANDOM_UTILS_H_
 
 #include <assert.h>
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -60,4 +60,4 @@ static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_RANDOM_H_ */
+#endif  /* WEBP_UTILS_RANDOM_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/rescaler_utils.c b/thirdparty/libwebp/src/utils/rescaler_utils.c
index 0d1f80da24..90e2ea76a1 100644
--- a/thirdparty/libwebp/utils/rescaler_utils.c
+++ b/thirdparty/libwebp/src/utils/rescaler_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "../dsp/dsp.h"
-#include "./rescaler_utils.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -85,11 +85,13 @@ int WebPRescalerGetScaledDimensions(int src_width, int src_height,
 
     // if width is unspecified, scale original proportionally to height ratio.
     if (width == 0) {
-      width = (src_width * height + src_height / 2) / src_height;
+      width =
+          (int)(((uint64_t)src_width * height + src_height / 2) / src_height);
     }
     // if height is unspecified, scale original proportionally to width ratio.
     if (height == 0) {
-      height = (src_height * width + src_width / 2) / src_width;
+      height =
+          (int)(((uint64_t)src_height * width + src_width / 2) / src_width);
     }
     // Check if the overall dimensions still make sense.
     if (width <= 0 || height <= 0) {
diff --git a/thirdparty/libwebp/utils/rescaler_utils.h b/thirdparty/libwebp/src/utils/rescaler_utils.h
index 98b01a76d0..8890e6fa13 100644
--- a/thirdparty/libwebp/utils/rescaler_utils.h
+++ b/thirdparty/libwebp/src/utils/rescaler_utils.h
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_RESCALER_H_
-#define WEBP_UTILS_RESCALER_H_
+#ifndef WEBP_UTILS_RESCALER_UTILS_H_
+#define WEBP_UTILS_RESCALER_UTILS_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #define WEBP_RESCALER_RFIX 32   // fixed-point precision for multiplies
 #define WEBP_RESCALER_ONE (1ull << WEBP_RESCALER_RFIX)
@@ -98,4 +98,4 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_RESCALER_H_ */
+#endif  /* WEBP_UTILS_RESCALER_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/thread_utils.c b/thirdparty/libwebp/src/utils/thread_utils.c
index 1729060c70..2052b6b006 100644
--- a/thirdparty/libwebp/utils/thread_utils.c
+++ b/thirdparty/libwebp/src/utils/thread_utils.c
@@ -13,8 +13,8 @@
 
 #include <assert.h>
 #include <string.h>   // for memset()
-#include "./thread_utils.h"
-#include "./utils.h"
+#include "src/utils/thread_utils.h"
+#include "src/utils/utils.h"
 
 #ifdef WEBP_USE_THREAD
 
@@ -50,11 +50,11 @@ typedef struct {
 
 #endif  // _WIN32
 
-struct WebPWorkerImpl {
+typedef struct {
   pthread_mutex_t mutex_;
   pthread_cond_t  condition_;
   pthread_t       thread_;
-};
+} WebPWorkerImpl;
 
 #if defined(_WIN32)
 
@@ -201,25 +201,24 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
 
 //------------------------------------------------------------------------------
 
-static void Execute(WebPWorker* const worker);  // Forward declaration.
-
 static THREADFN ThreadLoop(void* ptr) {
   WebPWorker* const worker = (WebPWorker*)ptr;
+  WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
   int done = 0;
   while (!done) {
-    pthread_mutex_lock(&worker->impl_->mutex_);
+    pthread_mutex_lock(&impl->mutex_);
     while (worker->status_ == OK) {   // wait in idling mode
-      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
     }
     if (worker->status_ == WORK) {
-      Execute(worker);
+      WebPGetWorkerInterface()->Execute(worker);
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
     }
     // signal to the main thread that we're done (for Sync())
-    pthread_cond_signal(&worker->impl_->condition_);
-    pthread_mutex_unlock(&worker->impl_->mutex_);
+    pthread_cond_signal(&impl->condition_);
+    pthread_mutex_unlock(&impl->mutex_);
   }
   return THREAD_RETURN(NULL);    // Thread is finished
 }
@@ -229,21 +228,22 @@ static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
   // No-op when attempting to change state on a thread that didn't come up.
   // Checking status_ without acquiring the lock first would result in a data
   // race.
-  if (worker->impl_ == NULL) return;
+  WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
+  if (impl == NULL) return;
 
-  pthread_mutex_lock(&worker->impl_->mutex_);
+  pthread_mutex_lock(&impl->mutex_);
   if (worker->status_ >= OK) {
     // wait for the worker to finish
     while (worker->status_ != OK) {
-      pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_);
+      pthread_cond_wait(&impl->condition_, &impl->mutex_);
     }
     // assign new status and release the working thread if needed
     if (new_status != OK) {
       worker->status_ = new_status;
-      pthread_cond_signal(&worker->impl_->condition_);
+      pthread_cond_signal(&impl->condition_);
     }
   }
-  pthread_mutex_unlock(&worker->impl_->mutex_);
+  pthread_mutex_unlock(&impl->mutex_);
 }
 
 #endif  // WEBP_USE_THREAD
@@ -268,26 +268,28 @@ static int Reset(WebPWorker* const worker) {
   worker->had_error = 0;
   if (worker->status_ < OK) {
 #ifdef WEBP_USE_THREAD
-    worker->impl_ = (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(*worker->impl_));
+    WebPWorkerImpl* const impl =
+        (WebPWorkerImpl*)WebPSafeCalloc(1, sizeof(WebPWorkerImpl));
+    worker->impl_ = (void*)impl;
     if (worker->impl_ == NULL) {
       return 0;
     }
-    if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) {
+    if (pthread_mutex_init(&impl->mutex_, NULL)) {
       goto Error;
     }
-    if (pthread_cond_init(&worker->impl_->condition_, NULL)) {
-      pthread_mutex_destroy(&worker->impl_->mutex_);
+    if (pthread_cond_init(&impl->condition_, NULL)) {
+      pthread_mutex_destroy(&impl->mutex_);
       goto Error;
     }
-    pthread_mutex_lock(&worker->impl_->mutex_);
-    ok = !pthread_create(&worker->impl_->thread_, NULL, ThreadLoop, worker);
+    pthread_mutex_lock(&impl->mutex_);
+    ok = !pthread_create(&impl->thread_, NULL, ThreadLoop, worker);
     if (ok) worker->status_ = OK;
-    pthread_mutex_unlock(&worker->impl_->mutex_);
+    pthread_mutex_unlock(&impl->mutex_);
     if (!ok) {
-      pthread_mutex_destroy(&worker->impl_->mutex_);
-      pthread_cond_destroy(&worker->impl_->condition_);
+      pthread_mutex_destroy(&impl->mutex_);
+      pthread_cond_destroy(&impl->condition_);
  Error:
-      WebPSafeFree(worker->impl_);
+      WebPSafeFree(impl);
       worker->impl_ = NULL;
       return 0;
     }
@@ -318,11 +320,12 @@ static void Launch(WebPWorker* const worker) {
 static void End(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
   if (worker->impl_ != NULL) {
+    WebPWorkerImpl* const impl = (WebPWorkerImpl*)worker->impl_;
     ChangeState(worker, NOT_OK);
-    pthread_join(worker->impl_->thread_, NULL);
-    pthread_mutex_destroy(&worker->impl_->mutex_);
-    pthread_cond_destroy(&worker->impl_->condition_);
-    WebPSafeFree(worker->impl_);
+    pthread_join(impl->thread_, NULL);
+    pthread_mutex_destroy(&impl->mutex_);
+    pthread_cond_destroy(&impl->condition_);
+    WebPSafeFree(impl);
     worker->impl_ = NULL;
   }
 #else
diff --git a/thirdparty/libwebp/utils/thread_utils.h b/thirdparty/libwebp/src/utils/thread_utils.h
index 8408311855..c8ae6c9033 100644
--- a/thirdparty/libwebp/utils/thread_utils.h
+++ b/thirdparty/libwebp/src/utils/thread_utils.h
@@ -11,14 +11,14 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#ifndef WEBP_UTILS_THREAD_H_
-#define WEBP_UTILS_THREAD_H_
+#ifndef WEBP_UTILS_THREAD_UTILS_H_
+#define WEBP_UTILS_THREAD_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
-#include "../webp/types.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,12 +35,9 @@ typedef enum {
 // arguments (data1 and data2), and should return false in case of error.
 typedef int (*WebPWorkerHook)(void*, void*);
 
-// Platform-dependent implementation details for the worker.
-typedef struct WebPWorkerImpl WebPWorkerImpl;
-
 // Synchronization object used to launch job in the worker thread
 typedef struct {
-  WebPWorkerImpl* impl_;
+  void* impl_;            // platform-dependent implementation worker details
   WebPWorkerStatus status_;
   WebPWorkerHook hook;    // hook to call
   void* data1;            // first argument passed to 'hook'
@@ -78,11 +75,11 @@ typedef struct {
 // decoding takes place. The contents of the interface struct are copied, it
 // is safe to free the corresponding memory after this call. This function is
 // not thread-safe. Return false in case of invalid pointer or methods.
-WEBP_EXTERN(int) WebPSetWorkerInterface(
+WEBP_EXTERN int WebPSetWorkerInterface(
     const WebPWorkerInterface* const winterface);
 
 // Retrieve the currently set thread worker interface.
-WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
+WEBP_EXTERN const WebPWorkerInterface* WebPGetWorkerInterface(void);
 
 //------------------------------------------------------------------------------
 
@@ -90,4 +87,4 @@ WEBP_EXTERN(const WebPWorkerInterface*) WebPGetWorkerInterface(void);
 }    // extern "C"
 #endif
 
-#endif  /* WEBP_UTILS_THREAD_H_ */
+#endif  /* WEBP_UTILS_THREAD_UTILS_H_ */
diff --git a/thirdparty/libwebp/utils/utils.c b/thirdparty/libwebp/src/utils/utils.c
index 504d924b60..44d5c14f01 100644
--- a/thirdparty/libwebp/utils/utils.c
+++ b/thirdparty/libwebp/src/utils/utils.c
@@ -13,10 +13,11 @@
 
 #include <stdlib.h>
 #include <string.h>  // for memcpy()
-#include "../webp/decode.h"
-#include "../webp/encode.h"
-#include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
-#include "./utils.h"
+#include "src/webp/decode.h"
+#include "src/webp/encode.h"
+#include "src/webp/format_constants.h"  // for MAX_PALETTE_SIZE
+#include "src/utils/color_cache_utils.h"
+#include "src/utils/utils.h"
 
 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
 // alloc/free etc) is printed. For debugging/tuning purpose only (it's slow,
@@ -252,7 +253,6 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
   int num_colors = 0;
   uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
   uint32_t colors[COLOR_HASH_SIZE];
-  static const uint64_t kHashMul = 0x1e35a7bdull;
   const uint32_t* argb = pic->argb;
   const int width = pic->width;
   const int height = pic->height;
@@ -267,7 +267,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
         continue;
       }
       last_pix = argb[x];
-      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
+      key = VP8LHashPix(last_pix, COLOR_HASH_RIGHT_SHIFT);
       while (1) {
         if (!in_use[key]) {
           colors[key] = last_pix;
diff --git a/thirdparty/libwebp/utils/utils.h b/thirdparty/libwebp/src/utils/utils.h
index 3ab459050a..52921bf24e 100644
--- a/thirdparty/libwebp/utils/utils.h
+++ b/thirdparty/libwebp/src/utils/utils.h
@@ -16,14 +16,14 @@
 #define WEBP_UTILS_UTILS_H_
 
 #ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
+#include "src/webp/config.h"
 #endif
 
 #include <assert.h>
 #include <limits.h>
 
-#include "../dsp/dsp.h"
-#include "../webp/types.h"
+#include "src/dsp/dsp.h"
+#include "src/webp/types.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,13 +48,13 @@ extern "C" {
 // somewhere (like: malloc(num_pixels * sizeof(*something))). That's why this
 // safe malloc() borrows the signature from calloc(), pointing at the dangerous
 // underlying multiply involved.
-WEBP_EXTERN(void*) WebPSafeMalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN void* WebPSafeMalloc(uint64_t nmemb, size_t size);
 // Note that WebPSafeCalloc() expects the second argument type to be 'size_t'
 // in order to favor the "calloc(num_foo, sizeof(foo))" pattern.
-WEBP_EXTERN(void*) WebPSafeCalloc(uint64_t nmemb, size_t size);
+WEBP_EXTERN void* WebPSafeCalloc(uint64_t nmemb, size_t size);
 
 // Companion deallocation function to the above allocations.
-WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
+WEBP_EXTERN void WebPSafeFree(void* const ptr);
 
 //------------------------------------------------------------------------------
 // Alignment
@@ -66,7 +66,7 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
   uint32_t A;
-  memcpy(&A, (const int*)ptr, sizeof(A));
+  memcpy(&A, ptr, sizeof(A));
   return A;
 }
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
@@ -112,12 +112,12 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
 #define WEBP_NEED_LOG_TABLE_8BIT
 extern const uint8_t WebPLogTable8bit[256];
 static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
-  int log = 0;
+  int log_value = 0;
   while (n >= 256) {
-    log += 8;
+    log_value += 8;
     n >>= 8;
   }
-  return log + WebPLogTable8bit[n];
+  return log_value + WebPLogTable8bit[n];
 }
 
 // Returns (int)floor(log2(n)). n must be > 0.
@@ -147,14 +147,14 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
 struct WebPPicture;
 
 // Copy width x height pixels from 'src' to 'dst' honoring the strides.
-WEBP_EXTERN(void) WebPCopyPlane(const uint8_t* src, int src_stride,
-                                uint8_t* dst, int dst_stride,
-                                int width, int height);
+WEBP_EXTERN void WebPCopyPlane(const uint8_t* src, int src_stride,
+                               uint8_t* dst, int dst_stride,
+                               int width, int height);
 
 // Copy ARGB pixels from 'src' to 'dst' honoring strides. 'src' and 'dst' are
 // assumed to be already allocated and using ARGB data.
-WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
-                                 struct WebPPicture* const dst);
+WEBP_EXTERN void WebPCopyPixels(const struct WebPPicture* const src,
+                                struct WebPPicture* const dst);
 
 //------------------------------------------------------------------------------
 // Unique colors.
@@ -166,8 +166,8 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
 // MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
 // Note: 'palette' is assumed to be an array already allocated with at least
 // MAX_PALETTE_SIZE elements.
-WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
-                                     uint32_t* const palette);
+WEBP_EXTERN int WebPGetColorPalette(const struct WebPPicture* const pic,
+                                    uint32_t* const palette);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/decode.h b/thirdparty/libwebp/src/webp/decode.h
index 4c5e74ac36..2165e96c95 100644
--- a/thirdparty/libwebp/webp/decode.h
+++ b/thirdparty/libwebp/src/webp/decode.h
@@ -36,39 +36,39 @@ typedef struct WebPDecoderConfig WebPDecoderConfig;
 
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetDecoderVersion(void);
+WEBP_EXTERN int WebPGetDecoderVersion(void);
 
 // Retrieve basic header information: width, height.
 // This function will also validate the header, returning true on success,
 // false otherwise. '*width' and '*height' are only valid on successful return.
 // Pointers 'width' and 'height' can be passed NULL if deemed irrelevant.
-WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size,
-                             int* width, int* height);
+WEBP_EXTERN int WebPGetInfo(const uint8_t* data, size_t data_size,
+                            int* width, int* height);
 
 // Decodes WebP images pointed to by 'data' and returns RGBA samples, along
 // with the dimensions in *width and *height. The ordering of samples in
 // memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
 // The returned pointer should be deleted calling WebPFree().
 // Returns NULL in case of error.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size,
-                                     int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
 
 // Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
 // If the bitstream contains transparency, it is ignored.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                                   int* width, int* height);
 
 // Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
-WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
-                                    int* width, int* height);
+WEBP_EXTERN uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
+                                   int* width, int* height);
 
 
 // Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
@@ -80,13 +80,13 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
-WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size,
-                                    int* width, int* height,
-                                    uint8_t** u, uint8_t** v,
-                                    int* stride, int* uv_stride);
+WEBP_EXTERN uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
+                                   int* width, int* height,
+                                   uint8_t** u, uint8_t** v,
+                                   int* stride, int* uv_stride);
 
 // Releases memory returned by the WebPDecode*() functions above.
-WEBP_EXTERN(void) WebPFree(void* ptr);
+WEBP_EXTERN void WebPFree(void* ptr);
 
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
@@ -96,22 +96,22 @@ WEBP_EXTERN(void) WebPFree(void* ptr);
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
+WEBP_EXTERN uint8_t* WebPDecodeRGBAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
+WEBP_EXTERN uint8_t* WebPDecodeARGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
+WEBP_EXTERN uint8_t* WebPDecodeBGRAInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
 // RGB and BGR variants. Here too the transparency information, if present,
 // will be dropped and ignored.
-WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
+WEBP_EXTERN uint8_t* WebPDecodeRGBInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
-WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
+WEBP_EXTERN uint8_t* WebPDecodeBGRInto(
     const uint8_t* data, size_t data_size,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -122,7 +122,7 @@ WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
 // 'u_size' and 'v_size' respectively.
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
-WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
+WEBP_EXTERN uint8_t* WebPDecodeYUVInto(
     const uint8_t* data, size_t data_size,
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
@@ -213,7 +213,7 @@ struct WebPDecBuffer {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
+WEBP_EXTERN int WebPInitDecBufferInternal(WebPDecBuffer*, int);
 
 // Initialize the structure as empty. Must be called before any other use.
 // Returns false in case of version mismatch
@@ -223,7 +223,7 @@ static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
 
 // Free any memory associated with the buffer. Must always be called last.
 // Note: doesn't free the 'buffer' structure itself.
-WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
+WEBP_EXTERN void WebPFreeDecBuffer(WebPDecBuffer* buffer);
 
 //------------------------------------------------------------------------------
 // Enumeration of the status codes
@@ -277,7 +277,7 @@ typedef enum VP8StatusCode {
 // within valid bounds.
 // All other fields of WebPDecBuffer MUST remain constant between calls.
 // Returns NULL if the allocation failed.
-WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
+WEBP_EXTERN WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer);
 
 // This function allocates and initializes an incremental-decoder object, which
 // will output the RGB/A samples specified by 'csp' into a preallocated
@@ -289,7 +289,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 // colorspace 'csp' is taken into account for allocating this buffer. All other
 // parameters are ignored.
 // Returns NULL if the allocation failed, or if some parameters are invalid.
-WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
+WEBP_EXTERN WebPIDecoder* WebPINewRGB(
     WEBP_CSP_MODE csp,
     uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 
@@ -304,7 +304,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
 // In this case, the output buffer will be automatically allocated (using
 // MODE_YUVA) when decoding starts. All parameters are then ignored.
 // Returns NULL if the allocation failed or if a parameter is invalid.
-WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
+WEBP_EXTERN WebPIDecoder* WebPINewYUVA(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride,
@@ -312,19 +312,19 @@ WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
 
 // Deprecated version of the above, without the alpha plane.
 // Kept for backward compatibility.
-WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
+WEBP_EXTERN WebPIDecoder* WebPINewYUV(
     uint8_t* luma, size_t luma_size, int luma_stride,
     uint8_t* u, size_t u_size, int u_stride,
     uint8_t* v, size_t v_size, int v_stride);
 
 // Deletes the WebPIDecoder object and associated memory. Must always be called
 // if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded.
-WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec);
+WEBP_EXTERN void WebPIDelete(WebPIDecoder* idec);
 
 // Copies and decodes the next available data. Returns VP8_STATUS_OK when
 // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
 // data is expected. Returns error in other cases.
-WEBP_EXTERN(VP8StatusCode) WebPIAppend(
+WEBP_EXTERN VP8StatusCode WebPIAppend(
     WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
 // A variant of the above function to be used when data buffer contains
@@ -332,7 +332,7 @@ WEBP_EXTERN(VP8StatusCode) WebPIAppend(
 // to the internal memory.
 // Note that the value of the 'data' pointer can change between calls to
 // WebPIUpdate, for instance when the data buffer is resized to fit larger data.
-WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
+WEBP_EXTERN VP8StatusCode WebPIUpdate(
     WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 
 // Returns the RGB/A image decoded so far. Returns NULL if output params
@@ -340,15 +340,16 @@ WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
 // specified during call to WebPINewDecoder() or WebPINewRGB().
 // *last_y is the index of last decoded row in raster scan order. Some pointers
 // (*last_y, *width etc.) can be NULL if corresponding information is not
-// needed.
-WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
+// needed. The values in these pointers are only valid on successful (non-NULL)
+// return.
+WEBP_EXTERN uint8_t* WebPIDecGetRGB(
     const WebPIDecoder* idec, int* last_y,
     int* width, int* height, int* stride);
 
 // Same as above function to get a YUVA image. Returns pointer to the luma
 // plane or NULL in case of error. If there is no alpha information
 // the alpha pointer '*a' will be returned NULL.
-WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
+WEBP_EXTERN uint8_t* WebPIDecGetYUVA(
     const WebPIDecoder* idec, int* last_y,
     uint8_t** u, uint8_t** v, uint8_t** a,
     int* width, int* height, int* stride, int* uv_stride, int* a_stride);
@@ -368,7 +369,7 @@ static WEBP_INLINE uint8_t* WebPIDecGetYUV(
 // Returns NULL in case the incremental decoder object is in an invalid state.
 // Otherwise returns the pointer to the internal representation. This structure
 // is read-only, tied to WebPIDecoder's lifespan and should not be modified.
-WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
+WEBP_EXTERN const WebPDecBuffer* WebPIDecodedArea(
     const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
 
 //------------------------------------------------------------------------------
@@ -416,7 +417,7 @@ struct WebPBitstreamFeatures {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
+WEBP_EXTERN VP8StatusCode WebPGetFeaturesInternal(
     const uint8_t*, size_t, WebPBitstreamFeatures*, int);
 
 // Retrieve features from the bitstream. The *features structure is filled
@@ -457,7 +458,7 @@ struct WebPDecoderConfig {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
+WEBP_EXTERN int WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
 
 // Initialize the configuration as empty. This function must always be
 // called first, unless WebPGetFeatures() is to be called.
@@ -477,14 +478,14 @@ static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
 // The return WebPIDecoder object must always be deleted calling WebPIDelete().
 // Returns NULL in case of error (and config->status will then reflect
 // the error condition, if available).
-WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
-                                       WebPDecoderConfig* config);
+WEBP_EXTERN WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
+                                      WebPDecoderConfig* config);
 
 // Non-incremental version. This version decodes the full data at once, taking
 // 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
 // if the decoding was successful). Note that 'config' cannot be NULL.
-WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
-                                      WebPDecoderConfig* config);
+WEBP_EXTERN VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
+                                     WebPDecoderConfig* config);
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/thirdparty/libwebp/webp/demux.h b/thirdparty/libwebp/src/webp/demux.h
index 454f6914b2..555d641338 100644
--- a/thirdparty/libwebp/webp/demux.h
+++ b/thirdparty/libwebp/src/webp/demux.h
@@ -71,7 +71,7 @@ typedef struct WebPAnimDecoderOptions WebPAnimDecoderOptions;
 
 // Returns the version number of the demux library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetDemuxVersion(void);
+WEBP_EXTERN int WebPGetDemuxVersion(void);
 
 //------------------------------------------------------------------------------
 // Life of a Demux object
@@ -85,7 +85,7 @@ typedef enum WebPDemuxState {
 } WebPDemuxState;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPDemuxer*) WebPDemuxInternal(
+WEBP_EXTERN WebPDemuxer* WebPDemuxInternal(
     const WebPData*, int, WebPDemuxState*, int);
 
 // Parses the full WebP file given by 'data'. For single images the WebP file
@@ -109,27 +109,32 @@ static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
 }
 
 // Frees memory associated with 'dmux'.
-WEBP_EXTERN(void) WebPDemuxDelete(WebPDemuxer* dmux);
+WEBP_EXTERN void WebPDemuxDelete(WebPDemuxer* dmux);
 
 //------------------------------------------------------------------------------
 // Data/information extraction.
 
 typedef enum WebPFormatFeature {
-  WEBP_FF_FORMAT_FLAGS,  // Extended format flags present in the 'VP8X' chunk.
+  WEBP_FF_FORMAT_FLAGS,      // bit-wise combination of WebPFeatureFlags
+                             // corresponding to the 'VP8X' chunk (if present).
   WEBP_FF_CANVAS_WIDTH,
   WEBP_FF_CANVAS_HEIGHT,
-  WEBP_FF_LOOP_COUNT,
-  WEBP_FF_BACKGROUND_COLOR,
-  WEBP_FF_FRAME_COUNT    // Number of frames present in the demux object.
-                         // In case of a partial demux, this is the number of
-                         // frames seen so far, with the last frame possibly
-                         // being partial.
+  WEBP_FF_LOOP_COUNT,        // only relevant for animated file
+  WEBP_FF_BACKGROUND_COLOR,  // idem.
+  WEBP_FF_FRAME_COUNT        // Number of frames present in the demux object.
+                             // In case of a partial demux, this is the number
+                             // of frames seen so far, with the last frame
+                             // possibly being partial.
 } WebPFormatFeature;
 
 // Get the 'feature' value from the 'dmux'.
 // NOTE: values are only valid if WebPDemux() was used or WebPDemuxPartial()
 // returned a state > WEBP_DEMUX_PARSING_HEADER.
-WEBP_EXTERN(uint32_t) WebPDemuxGetI(
+// If 'feature' is WEBP_FF_FORMAT_FLAGS, the returned value is a bit-wise
+// combination of WebPFeatureFlags values.
+// If 'feature' is WEBP_FF_LOOP_COUNT, WEBP_FF_BACKGROUND_COLOR, the returned
+// value is only meaningful if the bitstream is animated.
+WEBP_EXTERN uint32_t WebPDemuxGetI(
     const WebPDemuxer* dmux, WebPFormatFeature feature);
 
 //------------------------------------------------------------------------------
@@ -159,20 +164,20 @@ struct WebPIterator {
 // Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
 // Call WebPDemuxReleaseIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of 'iter'.
-WEBP_EXTERN(int) WebPDemuxGetFrame(
+WEBP_EXTERN int WebPDemuxGetFrame(
     const WebPDemuxer* dmux, int frame_number, WebPIterator* iter);
 
 // Sets 'iter->fragment' to point to the next ('iter->frame_num' + 1) or
 // previous ('iter->frame_num' - 1) frame. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextFrame(WebPIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevFrame(WebPIterator* iter);
+WEBP_EXTERN int WebPDemuxNextFrame(WebPIterator* iter);
+WEBP_EXTERN int WebPDemuxPrevFrame(WebPIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before any subsequent calls to WebPDemuxGetChunk() on the same
 // iter. Also, must be called before destroying the associated WebPDemuxer with
 // WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseIterator(WebPIterator* iter);
+WEBP_EXTERN void WebPDemuxReleaseIterator(WebPIterator* iter);
 
 //------------------------------------------------------------------------------
 // Chunk iteration.
@@ -197,20 +202,20 @@ struct WebPChunkIterator {
 // payloads are accessed through WebPDemuxGetFrame() and related functions.
 // Call WebPDemuxReleaseChunkIterator() when use of the iterator is complete.
 // NOTE: 'dmux' must persist for the lifetime of the iterator.
-WEBP_EXTERN(int) WebPDemuxGetChunk(const WebPDemuxer* dmux,
-                                   const char fourcc[4], int chunk_number,
-                                   WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxGetChunk(const WebPDemuxer* dmux,
+                                  const char fourcc[4], int chunk_number,
+                                  WebPChunkIterator* iter);
 
 // Sets 'iter->chunk' to point to the next ('iter->chunk_num' + 1) or previous
 // ('iter->chunk_num' - 1) chunk. These functions do not loop.
 // Returns true on success, false otherwise.
-WEBP_EXTERN(int) WebPDemuxNextChunk(WebPChunkIterator* iter);
-WEBP_EXTERN(int) WebPDemuxPrevChunk(WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxNextChunk(WebPChunkIterator* iter);
+WEBP_EXTERN int WebPDemuxPrevChunk(WebPChunkIterator* iter);
 
 // Releases any memory associated with 'iter'.
 // Must be called before destroying the associated WebPDemuxer with
 // WebPDemuxDelete().
-WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
+WEBP_EXTERN void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
 
 //------------------------------------------------------------------------------
 // WebPAnimDecoder API
@@ -252,7 +257,7 @@ struct WebPAnimDecoderOptions {
 };
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(int) WebPAnimDecoderOptionsInitInternal(
+WEBP_EXTERN int WebPAnimDecoderOptionsInitInternal(
     WebPAnimDecoderOptions*, int);
 
 // Should always be called, to initialize a fresh WebPAnimDecoderOptions
@@ -266,7 +271,7 @@ static WEBP_INLINE int WebPAnimDecoderOptionsInit(
 }
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(WebPAnimDecoder*) WebPAnimDecoderNewInternal(
+WEBP_EXTERN WebPAnimDecoder* WebPAnimDecoderNewInternal(
     const WebPData*, const WebPAnimDecoderOptions*, int);
 
 // Creates and initializes a WebPAnimDecoder object.
@@ -301,8 +306,8 @@ struct WebPAnimInfo {
 //   info - (out) global information fetched from the animation.
 // Returns:
 //   True on success.
-WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
-                                        WebPAnimInfo* info);
+WEBP_EXTERN int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
+                                       WebPAnimInfo* info);
 
 // Fetch the next frame from 'dec' based on options supplied to
 // WebPAnimDecoderNew(). This will be a fully reconstructed canvas of size
@@ -316,8 +321,8 @@ WEBP_EXTERN(int) WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec,
 // Returns:
 //   False if any of the arguments are NULL, or if there is a parsing or
 //   decoding error, or if there are no more frames. Otherwise, returns true.
-WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
-                                        uint8_t** buf, int* timestamp);
+WEBP_EXTERN int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
+                                       uint8_t** buf, int* timestamp);
 
 // Check if there are more frames left to decode.
 // Parameters:
@@ -325,7 +330,7 @@ WEBP_EXTERN(int) WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
 // Returns:
 //   True if 'dec' is not NULL and some frames are yet to be decoded.
 //   Otherwise, returns false.
-WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
+WEBP_EXTERN int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
 
 // Resets the WebPAnimDecoder object, so that next call to
 // WebPAnimDecoderGetNext() will restart decoding from 1st frame. This would be
@@ -333,7 +338,7 @@ WEBP_EXTERN(int) WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec);
 // info.loop_count times) without destroying and recreating the 'dec' object.
 // Parameters:
 //   dec - (in/out) decoder instance to be reset
-WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec);
+WEBP_EXTERN void WebPAnimDecoderReset(WebPAnimDecoder* dec);
 
 // Grab the internal demuxer object.
 // Getting the demuxer object can be useful if one wants to use operations only
@@ -343,13 +348,13 @@ WEBP_EXTERN(void) WebPAnimDecoderReset(WebPAnimDecoder* dec);
 //
 // Parameters:
 //   dec - (in) decoder instance from which the demuxer object is to be fetched.
-WEBP_EXTERN(const WebPDemuxer*) WebPAnimDecoderGetDemuxer(
+WEBP_EXTERN const WebPDemuxer* WebPAnimDecoderGetDemuxer(
     const WebPAnimDecoder* dec);
 
 // Deletes the WebPAnimDecoder object.
 // Parameters:
 //   dec - (in/out) decoder instance to be deleted
-WEBP_EXTERN(void) WebPAnimDecoderDelete(WebPAnimDecoder* dec);
+WEBP_EXTERN void WebPAnimDecoderDelete(WebPAnimDecoder* dec);
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/thirdparty/libwebp/webp/encode.h b/thirdparty/libwebp/src/webp/encode.h
index 35fde1d052..7ec3543dc2 100644
--- a/thirdparty/libwebp/webp/encode.h
+++ b/thirdparty/libwebp/src/webp/encode.h
@@ -35,7 +35,7 @@ typedef struct WebPMemoryWriter WebPMemoryWriter;
 
 // Return the encoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetEncoderVersion(void);
+WEBP_EXTERN int WebPGetEncoderVersion(void);
 
 //------------------------------------------------------------------------------
 // One-stop-shop call! No questions asked:
@@ -46,37 +46,37 @@ WEBP_EXTERN(int) WebPGetEncoderVersion(void);
 // These functions compress using the lossy format, and the quality_factor
 // can go from 0 (smaller output, lower quality) to 100 (best quality,
 // larger output).
-WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb,
+WEBP_EXTERN size_t WebPEncodeRGB(const uint8_t* rgb,
+                                 int width, int height, int stride,
+                                 float quality_factor, uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeBGR(const uint8_t* bgr,
+                                 int width, int height, int stride,
+                                 float quality_factor, uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeRGBA(const uint8_t* rgba,
                                   int width, int height, int stride,
                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr,
+WEBP_EXTERN size_t WebPEncodeBGRA(const uint8_t* bgra,
                                   int width, int height, int stride,
                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba,
-                                   int width, int height, int stride,
-                                   float quality_factor, uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra,
-                                   int width, int height, int stride,
-                                   float quality_factor, uint8_t** output);
 
 // These functions are the equivalent of the above, but compressing in a
 // lossless manner. Files are usually larger than lossy format, but will
 // not suffer any compression loss.
-WEBP_EXTERN(size_t) WebPEncodeLosslessRGB(const uint8_t* rgb,
+WEBP_EXTERN size_t WebPEncodeLosslessRGB(const uint8_t* rgb,
+                                         int width, int height, int stride,
+                                         uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeLosslessBGR(const uint8_t* bgr,
+                                         int width, int height, int stride,
+                                         uint8_t** output);
+WEBP_EXTERN size_t WebPEncodeLosslessRGBA(const uint8_t* rgba,
                                           int width, int height, int stride,
                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessBGR(const uint8_t* bgr,
+WEBP_EXTERN size_t WebPEncodeLosslessBGRA(const uint8_t* bgra,
                                           int width, int height, int stride,
                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessRGBA(const uint8_t* rgba,
-                                           int width, int height, int stride,
-                                           uint8_t** output);
-WEBP_EXTERN(size_t) WebPEncodeLosslessBGRA(const uint8_t* bgra,
-                                           int width, int height, int stride,
-                                           uint8_t** output);
 
 // Releases memory returned by the WebPEncode*() functions above.
-WEBP_EXTERN(void) WebPFree(void* ptr);
+WEBP_EXTERN void WebPFree(void* ptr);
 
 //------------------------------------------------------------------------------
 // Coding parameters
@@ -93,12 +93,15 @@ typedef enum WebPImageHint {
 // Compression parameters.
 struct WebPConfig {
   int lossless;           // Lossless encoding (0=lossy(default), 1=lossless).
-  float quality;          // between 0 (smallest file) and 100 (biggest)
+  float quality;          // between 0 and 100. For lossy, 0 gives the smallest
+                          // size and 100 the largest. For lossless, this
+                          // parameter is the amount of effort put into the
+                          // compression: 0 is the fastest but gives larger
+                          // files compared to the slowest, but best, 100.
   int method;             // quality/speed trade-off (0=fast, 6=slower-better)
 
   WebPImageHint image_hint;  // Hint for image type (lossless only for now).
 
-  // Parameters related to lossy compression only:
   int target_size;        // if non-zero, set the desired target size in bytes.
                           // Takes precedence over the 'compression' parameter.
   float target_PSNR;      // if non-zero, specifies the minimal distortion to
@@ -159,7 +162,7 @@ typedef enum WebPPreset {
 } WebPPreset;
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
+WEBP_EXTERN int WebPConfigInitInternal(WebPConfig*, WebPPreset, float, int);
 
 // Should always be called, to initialize a fresh WebPConfig structure before
 // modification. Returns false in case of version mismatch. WebPConfigInit()
@@ -186,15 +189,15 @@ static WEBP_INLINE int WebPConfigPreset(WebPConfig* config,
 // speed and final compressed size.
 // This function will overwrite several fields from config: 'method', 'quality'
 // and 'lossless'. Returns false in case of parameter error.
-WEBP_EXTERN(int) WebPConfigLosslessPreset(WebPConfig* config, int level);
+WEBP_EXTERN int WebPConfigLosslessPreset(WebPConfig* config, int level);
 
 // Returns true if 'config' is non-NULL and all configuration parameters are
 // within their valid ranges.
-WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* config);
+WEBP_EXTERN int WebPValidateConfig(const WebPConfig* config);
 
 //------------------------------------------------------------------------------
 // Input / Output
-// Structure for storing auxiliary statistics (mostly for lossy encoding).
+// Structure for storing auxiliary statistics.
 
 struct WebPAuxStats {
   int coded_size;         // final size
@@ -242,16 +245,16 @@ struct WebPMemoryWriter {
 };
 
 // The following must be called first before any use.
-WEBP_EXTERN(void) WebPMemoryWriterInit(WebPMemoryWriter* writer);
+WEBP_EXTERN void WebPMemoryWriterInit(WebPMemoryWriter* writer);
 
 // The following must be called to deallocate writer->mem memory. The 'writer'
 // object itself is not deallocated.
-WEBP_EXTERN(void) WebPMemoryWriterClear(WebPMemoryWriter* writer);
+WEBP_EXTERN void WebPMemoryWriterClear(WebPMemoryWriter* writer);
 // The custom writer to be used with WebPMemoryWriter as custom_ptr. Upon
 // completion, writer.mem and writer.size will hold the coded data.
 // writer.mem must be freed by calling WebPMemoryWriterClear.
-WEBP_EXTERN(int) WebPMemoryWrite(const uint8_t* data, size_t data_size,
-                                 const WebPPicture* picture);
+WEBP_EXTERN int WebPMemoryWrite(const uint8_t* data, size_t data_size,
+                                const WebPPicture* picture);
 
 // Progress hook, called from time to time to report progress. It can return
 // false to request an abort of the encoding process, or true otherwise if
@@ -354,7 +357,7 @@ struct WebPPicture {
 };
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture*, int);
+WEBP_EXTERN int WebPPictureInitInternal(WebPPicture*, int);
 
 // Should always be called, to initialize the structure. Returns false in case
 // of version mismatch. WebPPictureInit() must have succeeded before using the
@@ -371,20 +374,20 @@ static WEBP_INLINE int WebPPictureInit(WebPPicture* picture) {
 // Allocate y/u/v buffers as per colorspace/width/height specification.
 // Note! This function will free the previous buffer if needed.
 // Returns false in case of memory error.
-WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureAlloc(WebPPicture* picture);
 
 // Release the memory allocated by WebPPictureAlloc() or WebPPictureImport*().
 // Note that this function does _not_ free the memory used by the 'picture'
 // object itself.
 // Besides memory (which is reclaimed) all other fields of 'picture' are
 // preserved.
-WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
+WEBP_EXTERN void WebPPictureFree(WebPPicture* picture);
 
 // Copy the pixels of *src into *dst, using WebPPictureAlloc. Upon return, *dst
 // will fully own the copied pixels (this is not a view). The 'dst' picture need
 // not be initialized as its content is overwritten.
 // Returns false in case of memory allocation error.
-WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
+WEBP_EXTERN int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 
 // Compute the single distortion for packed planes of samples.
 // 'src' will be compared to 'ref', and the raw distortion stored into
@@ -393,19 +396,19 @@ WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
 // 'x_step' is the horizontal stride (in bytes) between samples.
 // 'src/ref_stride' is the byte distance between rows.
 // Returns false in case of error (bad parameter, memory allocation error, ...).
-WEBP_EXTERN(int) WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
-                                     const uint8_t* ref, size_t ref_stride,
-                                     int width, int height,
-                                     size_t x_step,
-                                     int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
-                                     float* distortion, float* result);
+WEBP_EXTERN int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                                    const uint8_t* ref, size_t ref_stride,
+                                    int width, int height,
+                                    size_t x_step,
+                                    int type,   // 0 = PSNR, 1 = SSIM, 2 = LSIM
+                                    float* distortion, float* result);
 
 // Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results
 // are in dB, stored in result[] in the B/G/R/A/All order. The distortion is
 // always performed using ARGB samples. Hence if the input is YUV(A), the
 // picture will be internally converted to ARGB (just for the measurement).
 // Warning: this function is rather CPU-intensive.
-WEBP_EXTERN(int) WebPPictureDistortion(
+WEBP_EXTERN int WebPPictureDistortion(
     const WebPPicture* src, const WebPPicture* ref,
     int metric_type,           // 0 = PSNR, 1 = SSIM, 2 = LSIM
     float result[5]);
@@ -418,8 +421,8 @@ WEBP_EXTERN(int) WebPPictureDistortion(
 // must be fully be comprised inside the 'src' source picture. If the source
 // picture uses the YUV420 colorspace, the top and left coordinates will be
 // snapped to even values.
-WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
-                                 int left, int top, int width, int height);
+WEBP_EXTERN int WebPPictureCrop(WebPPicture* picture,
+                                int left, int top, int width, int height);
 
 // Extracts a view from 'src' picture into 'dst'. The rectangle for the view
 // is defined by the top-left corner pixel coordinates (left, top) as well
@@ -432,42 +435,42 @@ WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* picture,
 // with WebPPictureInit() if it is different from 'src', since its content will
 // be overwritten.
 // Returns false in case of memory allocation error or invalid parameters.
-WEBP_EXTERN(int) WebPPictureView(const WebPPicture* src,
-                                 int left, int top, int width, int height,
-                                 WebPPicture* dst);
+WEBP_EXTERN int WebPPictureView(const WebPPicture* src,
+                                int left, int top, int width, int height,
+                                WebPPicture* dst);
 
 // Returns true if the 'picture' is actually a view and therefore does
 // not own the memory for pixels.
-WEBP_EXTERN(int) WebPPictureIsView(const WebPPicture* picture);
+WEBP_EXTERN int WebPPictureIsView(const WebPPicture* picture);
 
 // Rescale a picture to new dimension width x height.
 // If either 'width' or 'height' (but not both) is 0 the corresponding
 // dimension will be calculated preserving the aspect ratio.
 // No gamma correction is applied.
 // Returns false in case of error (invalid parameter or insufficient memory).
-WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* pic, int width, int height);
+WEBP_EXTERN int WebPPictureRescale(WebPPicture* pic, int width, int height);
 
 // Colorspace conversion function to import RGB samples.
 // Previous buffer will be free'd, if any.
 // *rgb buffer should have a size of at least height * rgb_stride.
 // Returns false in case of memory error.
-WEBP_EXTERN(int) WebPPictureImportRGB(
+WEBP_EXTERN int WebPPictureImportRGB(
     WebPPicture* picture, const uint8_t* rgb, int rgb_stride);
 // Same, but for RGBA buffer.
-WEBP_EXTERN(int) WebPPictureImportRGBA(
+WEBP_EXTERN int WebPPictureImportRGBA(
     WebPPicture* picture, const uint8_t* rgba, int rgba_stride);
 // Same, but for RGBA buffer. Imports the RGB direct from the 32-bit format
 // input buffer ignoring the alpha channel. Avoids needing to copy the data
 // to a temporary 24-bit RGB buffer to import the RGB only.
-WEBP_EXTERN(int) WebPPictureImportRGBX(
+WEBP_EXTERN int WebPPictureImportRGBX(
     WebPPicture* picture, const uint8_t* rgbx, int rgbx_stride);
 
 // Variants of the above, but taking BGR(A|X) input.
-WEBP_EXTERN(int) WebPPictureImportBGR(
+WEBP_EXTERN int WebPPictureImportBGR(
     WebPPicture* picture, const uint8_t* bgr, int bgr_stride);
-WEBP_EXTERN(int) WebPPictureImportBGRA(
+WEBP_EXTERN int WebPPictureImportBGRA(
     WebPPicture* picture, const uint8_t* bgra, int bgra_stride);
-WEBP_EXTERN(int) WebPPictureImportBGRX(
+WEBP_EXTERN int WebPPictureImportBGRX(
     WebPPicture* picture, const uint8_t* bgrx, int bgrx_stride);
 
 // Converts picture->argb data to the YUV420A format. The 'colorspace'
@@ -476,14 +479,14 @@ WEBP_EXTERN(int) WebPPictureImportBGRX(
 // non-opaque transparent values is detected, and 'colorspace' will be
 // adjusted accordingly. Note that this method is lossy.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
-                                       WebPEncCSP /*colorspace = WEBP_YUV420*/);
+WEBP_EXTERN int WebPPictureARGBToYUVA(WebPPicture* picture,
+                                      WebPEncCSP /*colorspace = WEBP_YUV420*/);
 
 // Same as WebPPictureARGBToYUVA(), but the conversion is done using
 // pseudo-random dithering with a strength 'dithering' between
 // 0.0 (no dithering) and 1.0 (maximum dithering). This is useful
 // for photographic picture.
-WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
+WEBP_EXTERN int WebPPictureARGBToYUVADithered(
     WebPPicture* picture, WebPEncCSP colorspace, float dithering);
 
 // Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion.
@@ -491,9 +494,9 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
 // method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
 // and sharper YUV representation.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureSharpARGBToYUVA(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureSharpARGBToYUVA(WebPPicture* picture);
 // kept for backward compatibility:
-WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to
@@ -501,22 +504,22 @@ WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
 // Note that the use of this colorspace is discouraged if one has access to the
 // raw ARGB samples, since using YUV420 is comparatively lossy.
 // Returns false in case of error.
-WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
+WEBP_EXTERN int WebPPictureYUVAToARGB(WebPPicture* picture);
 
 // Helper function: given a width x height plane of RGBA or YUV(A) samples
-// clean-up the YUV or RGB samples under fully transparent area, to help
-// compressibility (no guarantee, though).
-WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
+// clean-up or smoothen the YUV or RGB samples under fully transparent area,
+// to help compressibility (no guarantee, though).
+WEBP_EXTERN void WebPCleanupTransparentArea(WebPPicture* picture);
 
 // Scan the picture 'picture' for the presence of non fully opaque alpha values.
 // Returns true in such case. Otherwise returns false (indicating that the
 // alpha plane can be ignored altogether e.g.).
-WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture);
+WEBP_EXTERN int WebPPictureHasTransparency(const WebPPicture* picture);
 
 // Remove the transparency information (if present) by blending the color with
 // the background color 'background_rgb' (specified as 24bit RGB triplet).
 // After this call, all alpha values are reset to 0xff.
-WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
+WEBP_EXTERN void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
 
 //------------------------------------------------------------------------------
 // Main call
@@ -531,7 +534,7 @@ WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
 // the former for lossy encoding, and the latter for lossless encoding
 // (when config.lossless is true). Automatic conversion from one format to
 // another is provided but they both incur some loss.
-WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture);
+WEBP_EXTERN int WebPEncode(const WebPConfig* config, WebPPicture* picture);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/format_constants.h b/thirdparty/libwebp/src/webp/format_constants.h
index 329fc8a3b0..329fc8a3b0 100644
--- a/thirdparty/libwebp/webp/format_constants.h
+++ b/thirdparty/libwebp/src/webp/format_constants.h
diff --git a/thirdparty/libwebp/webp/mux.h b/thirdparty/libwebp/src/webp/mux.h
index daccc65e86..28bb4a41c9 100644
--- a/thirdparty/libwebp/webp/mux.h
+++ b/thirdparty/libwebp/src/webp/mux.h
@@ -98,13 +98,13 @@ typedef enum WebPChunkId {
 
 // Returns the version number of the mux library, packed in hexadecimal using
 // 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetMuxVersion(void);
+WEBP_EXTERN int WebPGetMuxVersion(void);
 
 //------------------------------------------------------------------------------
 // Life of a Mux object
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPMux*) WebPNewInternal(int);
+WEBP_EXTERN WebPMux* WebPNewInternal(int);
 
 // Creates an empty mux object.
 // Returns:
@@ -117,13 +117,13 @@ static WEBP_INLINE WebPMux* WebPMuxNew(void) {
 // Deletes the mux object.
 // Parameters:
 //   mux - (in/out) object to be deleted
-WEBP_EXTERN(void) WebPMuxDelete(WebPMux* mux);
+WEBP_EXTERN void WebPMuxDelete(WebPMux* mux);
 
 //------------------------------------------------------------------------------
 // Mux creation.
 
 // Internal, version-checked, entry point
-WEBP_EXTERN(WebPMux*) WebPMuxCreateInternal(const WebPData*, int, int);
+WEBP_EXTERN WebPMux* WebPMuxCreateInternal(const WebPData*, int, int);
 
 // Creates a mux object from raw data given in WebP RIFF format.
 // Parameters:
@@ -160,7 +160,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
+WEBP_EXTERN WebPMuxError WebPMuxSetChunk(
     WebPMux* mux, const char fourcc[4], const WebPData* chunk_data,
     int copy_data);
 
@@ -176,7 +176,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given id.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk(
+WEBP_EXTERN WebPMuxError WebPMuxGetChunk(
     const WebPMux* mux, const char fourcc[4], WebPData* chunk_data);
 
 // Deletes the chunk with the given 'fourcc' from the mux object.
@@ -189,7 +189,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetChunk(
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given fourcc.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
+WEBP_EXTERN WebPMuxError WebPMuxDeleteChunk(
     WebPMux* mux, const char fourcc[4]);
 
 //------------------------------------------------------------------------------
@@ -222,7 +222,7 @@ struct WebPMuxFrameInfo {
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
+WEBP_EXTERN WebPMuxError WebPMuxSetImage(
     WebPMux* mux, const WebPData* bitstream, int copy_data);
 
 // Adds a frame at the end of the mux object.
@@ -241,7 +241,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
 //                               or if content of 'frame' is invalid.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
+WEBP_EXTERN WebPMuxError WebPMuxPushFrame(
     WebPMux* mux, const WebPMuxFrameInfo* frame, int copy_data);
 
 // Gets the nth frame from the mux object.
@@ -259,7 +259,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
 //   WEBP_MUX_BAD_DATA - if nth frame chunk in mux is invalid.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
+WEBP_EXTERN WebPMuxError WebPMuxGetFrame(
     const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame);
 
 // Deletes a frame from the mux object.
@@ -272,7 +272,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
 //   WEBP_MUX_NOT_FOUND - If there are less than nth frames in the mux object
 //                        before deletion.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth);
+WEBP_EXTERN WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth);
 
 //------------------------------------------------------------------------------
 // Animation.
@@ -296,7 +296,7 @@ struct WebPMuxAnimParams {
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
+WEBP_EXTERN WebPMuxError WebPMuxSetAnimationParams(
     WebPMux* mux, const WebPMuxAnimParams* params);
 
 // Gets the animation parameters from the mux object.
@@ -307,7 +307,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_NOT_FOUND - if ANIM chunk is not present in mux object.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
+WEBP_EXTERN WebPMuxError WebPMuxGetAnimationParams(
     const WebPMux* mux, WebPMuxAnimParams* params);
 
 //------------------------------------------------------------------------------
@@ -328,8 +328,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL; or
 //                               width or height are invalid or out of bounds
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux,
-                                               int width, int height);
+WEBP_EXTERN WebPMuxError WebPMuxSetCanvasSize(WebPMux* mux,
+                                              int width, int height);
 
 // Gets the canvas size from the mux object.
 // Note: This method assumes that the VP8X chunk, if present, is up-to-date.
@@ -343,8 +343,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetCanvasSize(WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, width or height is NULL.
 //   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
-                                               int* width, int* height);
+WEBP_EXTERN WebPMuxError WebPMuxGetCanvasSize(const WebPMux* mux,
+                                              int* width, int* height);
 
 // Gets the feature flags from the mux object.
 // Note: This method assumes that the VP8X chunk, if present, is up-to-date.
@@ -359,8 +359,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL.
 //   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
-                                             uint32_t* flags);
+WEBP_EXTERN WebPMuxError WebPMuxGetFeatures(const WebPMux* mux,
+                                            uint32_t* flags);
 
 // Gets number of chunks with the given 'id' in the mux object.
 // Parameters:
@@ -370,8 +370,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, or num_elements is NULL.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
-                                           WebPChunkId id, int* num_elements);
+WEBP_EXTERN WebPMuxError WebPMuxNumChunks(const WebPMux* mux,
+                                          WebPChunkId id, int* num_elements);
 
 // Assembles all chunks in WebP RIFF format and returns in 'assembled_data'.
 // This function also validates the mux object.
@@ -388,8 +388,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or assembled_data is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
-WEBP_EXTERN(WebPMuxError) WebPMuxAssemble(WebPMux* mux,
-                                          WebPData* assembled_data);
+WEBP_EXTERN WebPMuxError WebPMuxAssemble(WebPMux* mux,
+                                         WebPData* assembled_data);
 
 //------------------------------------------------------------------------------
 // WebPAnimEncoder API
@@ -442,7 +442,7 @@ struct WebPAnimEncoderOptions {
 };
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(int) WebPAnimEncoderOptionsInitInternal(
+WEBP_EXTERN int WebPAnimEncoderOptionsInitInternal(
     WebPAnimEncoderOptions*, int);
 
 // Should always be called, to initialize a fresh WebPAnimEncoderOptions
@@ -455,7 +455,7 @@ static WEBP_INLINE int WebPAnimEncoderOptionsInit(
 }
 
 // Internal, version-checked, entry point.
-WEBP_EXTERN(WebPAnimEncoder*) WebPAnimEncoderNewInternal(
+WEBP_EXTERN WebPAnimEncoder* WebPAnimEncoderNewInternal(
     int, int, const WebPAnimEncoderOptions*, int);
 
 // Creates and initializes a WebPAnimEncoder object.
@@ -490,7 +490,7 @@ static WEBP_INLINE WebPAnimEncoder* WebPAnimEncoderNew(
 // Returns:
 //   On error, returns false and frame->error_code is set appropriately.
 //   Otherwise, returns true.
-WEBP_EXTERN(int) WebPAnimEncoderAdd(
+WEBP_EXTERN int WebPAnimEncoderAdd(
     WebPAnimEncoder* enc, struct WebPPicture* frame, int timestamp_ms,
     const struct WebPConfig* config);
 
@@ -503,8 +503,8 @@ WEBP_EXTERN(int) WebPAnimEncoderAdd(
 //   webp_data - (out) generated WebP bitstream.
 // Returns:
 //   True on success.
-WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
-                                         WebPData* webp_data);
+WEBP_EXTERN int WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
+                                        WebPData* webp_data);
 
 // Get error string corresponding to the most recent call using 'enc'. The
 // returned string is owned by 'enc' and is valid only until the next call to
@@ -514,12 +514,12 @@ WEBP_EXTERN(int) WebPAnimEncoderAssemble(WebPAnimEncoder* enc,
 // Returns:
 //   NULL if 'enc' is NULL. Otherwise, returns the error string if the last call
 //   to 'enc' had an error, or an empty string if the last call was a success.
-WEBP_EXTERN(const char*) WebPAnimEncoderGetError(WebPAnimEncoder* enc);
+WEBP_EXTERN const char* WebPAnimEncoderGetError(WebPAnimEncoder* enc);
 
 // Deletes the WebPAnimEncoder object.
 // Parameters:
 //   enc - (in/out) object to be deleted
-WEBP_EXTERN(void) WebPAnimEncoderDelete(WebPAnimEncoder* enc);
+WEBP_EXTERN void WebPAnimEncoderDelete(WebPAnimEncoder* enc);
 
 //------------------------------------------------------------------------------
 
diff --git a/thirdparty/libwebp/webp/mux_types.h b/thirdparty/libwebp/src/webp/mux_types.h
index b37e2c67aa..b37e2c67aa 100644
--- a/thirdparty/libwebp/webp/mux_types.h
+++ b/thirdparty/libwebp/src/webp/mux_types.h
diff --git a/thirdparty/libwebp/webp/types.h b/thirdparty/libwebp/src/webp/types.h
index 98fff35a11..989a763f0d 100644
--- a/thirdparty/libwebp/webp/types.h
+++ b/thirdparty/libwebp/src/webp/types.h
@@ -40,9 +40,9 @@ typedef long long int int64_t;
 // This explicitly marks library functions and allows for changing the
 // signature for e.g., Windows DLL builds.
 # if defined(__GNUC__) && __GNUC__ >= 4
-#  define WEBP_EXTERN(type) extern __attribute__ ((visibility ("default"))) type
+#  define WEBP_EXTERN extern __attribute__ ((visibility ("default")))
 # else
-#  define WEBP_EXTERN(type) extern type
+#  define WEBP_EXTERN extern
 # endif  /* __GNUC__ >= 4 */
 #endif  /* WEBP_EXTERN */
 
diff --git a/thirdparty/squish/Add-Decompress-Bc5-to-Squish.patch b/thirdparty/squish/Add-Decompress-Bc5-to-Squish.patch
new file mode 100644
index 0000000000..1e06a8d318
--- /dev/null
+++ b/thirdparty/squish/Add-Decompress-Bc5-to-Squish.patch
@@ -0,0 +1,143 @@
+From 7b64cc4c8b0be0443741483bf65909f5140179c0 Mon Sep 17 00:00:00 2001
+From: Orkun <orkuntezerm@gmail.com>
+Date: Sun, 19 Nov 2017 02:24:31 +0300
+Subject: [PATCH] Fix #12220: Add Decompress Bc5 to Squish
+
+This Commit fixes the corrupted file preview described in #12220.
+Added DecompressColourBc5 function to squish.
+---
+ thirdparty/squish/colourblock.cpp | 85 +++++++++++++++++++++++++++++++++++++++
+ thirdparty/squish/colourblock.h   |  3 ++
+ thirdparty/squish/squish.cpp      |  8 +++-
+ 3 files changed, 95 insertions(+), 1 deletion(-)
+
+diff --git a/thirdparty/squish/colourblock.cpp b/thirdparty/squish/colourblock.cpp
+index af8b98036..3de46382c 100644
+--- a/thirdparty/squish/colourblock.cpp
++++ b/thirdparty/squish/colourblock.cpp
+@@ -211,4 +211,89 @@ void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
+     }
+ }
+ 
++// -- GODOT start --
++// Decodes one 8-byte BC4 channel block (same layout as a DXT5 alpha block)
++// into 16 texel values: two endpoint bytes followed by sixteen 3-bit palette
++// indices packed little-endian into the remaining six bytes.
++// @param texels  Output, 16 decoded values (one byte each).
++// @param bytes   Input, the 8 bytes of the channel block.
++static void DecompressChannelBc4( u8* texels, u8 const* bytes )
++{
++    // unpack the endpoints
++    int const end_0 = bytes[0];
++    int const end_1 = bytes[1];
++
++    // build the 8-entry palette
++    u8 codes[8];
++    codes[0] = ( u8 )end_0;
++    codes[1] = ( u8 )end_1;
++    if( end_0 > end_1 )
++    {
++        // 8-value mode: six interpolated midpoints
++        for( int i = 2; i < 8; ++i )
++        {
++            codes[i] = ( u8 )( ( ( 8 - i )*end_0 + ( i - 1 )*end_1 )/7 );
++        }
++    }
++    else
++    {
++        // 6-value mode: four interpolated midpoints plus explicit 0 and 255
++        for( int i = 2; i < 6; ++i )
++        {
++            codes[i] = ( u8 )( ( ( 6 - i )*end_0 + ( i - 1 )*end_1 )/5 );
++        }
++        codes[6] = 0;
++        codes[7] = 255;
++    }
++
++    // unpack the indices: every 3 bytes hold eight 3-bit palette indices
++    u8 const* src = bytes + 2;
++    for( int group = 0; group < 2; ++group )
++    {
++        // gather 24 bits, least significant byte first
++        int packed = 0;
++        for( int j = 0; j < 3; ++j )
++        {
++            packed |= src[3*group + j] << ( 8*j );
++        }
++
++        // look each index up in the palette
++        for( int j = 0; j < 8; ++j )
++        {
++            texels[8*group + j] = codes[( packed >> ( 3*j ) ) & 0x7];
++        }
++    }
++}
++
++// Decompresses a 16-byte BC5 (two-channel) block into 16 RGBA texels.
++//
++// @param rgba   Output, 64 bytes: red and green are decoded, blue is set to 0
++//               and alpha to 255.
++// @param block  Input, 16 bytes: the red BC4 block followed by the green one.
++//
++// Note that BC4/BC5 channels use 3-bit palette indices, unlike the 2-bit
++// indices of DXT1 colour blocks.
++void DecompressColourBc5( u8* rgba, void const* block)
++{
++    // get the block bytes
++    u8 const* bytes = reinterpret_cast< u8 const* >( block );
++
++    // red occupies the first 8 bytes, green the last 8
++    u8 red[16];
++    u8 green[16];
++    DecompressChannelBc4( red, bytes );
++    DecompressChannelBc4( green, bytes + 8 );
++
++    // store out the colours
++    for( int i = 0; i < 16; ++i )
++    {
++        rgba[4*i] = red[i];
++        rgba[4*i +1] = green[i];
++        rgba[4*i +2] = 0;
++        rgba[4*i +3] = 255;
++    }
++}
++// -- GODOT end --
++
++
+ } // namespace squish
+diff --git a/thirdparty/squish/colourblock.h b/thirdparty/squish/colourblock.h
+index fee2cd7c5..3cb9b7e3b 100644
+--- a/thirdparty/squish/colourblock.h
++++ b/thirdparty/squish/colourblock.h
+@@ -35,6 +35,9 @@ void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void*
+ void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+ 
+ void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
++// -- GODOT start --
++void DecompressColourBc5( u8* rgba, void const* block );
++// -- GODOT end --
+ 
+ } // namespace squish
+ 
+diff --git a/thirdparty/squish/squish.cpp b/thirdparty/squish/squish.cpp
+index 1d22a64ad..fd11a147d 100644
+--- a/thirdparty/squish/squish.cpp
++++ b/thirdparty/squish/squish.cpp
+@@ -135,7 +135,13 @@ void Decompress( u8* rgba, void const* block, int flags )
+         colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
+ 
+     // decompress colour
+-    DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
++    // -- GODOT start --
++    //DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
++    if(( flags & ( kBc5 ) ) != 0)
++        DecompressColourBc5( rgba, colourBlock);
++    else
++        DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
++    // -- GODOT end --
+ 
+     // decompress alpha separately if necessary
+     if( ( flags & kDxt3 ) != 0 )
+-- 
+2.13.6
+
diff --git a/thirdparty/squish/colourblock.cpp b/thirdparty/squish/colourblock.cpp
index af8b980365..3de46382c0 100644
--- a/thirdparty/squish/colourblock.cpp
+++ b/thirdparty/squish/colourblock.cpp
@@ -211,4 +211,89 @@ void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
     }
 }
 
+// -- GODOT start --
+// Decodes one 8-byte BC4 channel block (same layout as a DXT5 alpha block)
+// into 16 texel values: two endpoint bytes followed by sixteen 3-bit palette
+// indices packed little-endian into the remaining six bytes.
+// @param texels  Output, 16 decoded values (one byte each).
+// @param bytes   Input, the 8 bytes of the channel block.
+static void DecompressChannelBc4( u8* texels, u8 const* bytes )
+{
+    // unpack the endpoints
+    int const end_0 = bytes[0];
+    int const end_1 = bytes[1];
+
+    // build the 8-entry palette
+    u8 codes[8];
+    codes[0] = ( u8 )end_0;
+    codes[1] = ( u8 )end_1;
+    if( end_0 > end_1 )
+    {
+        // 8-value mode: six interpolated midpoints
+        for( int i = 2; i < 8; ++i )
+        {
+            codes[i] = ( u8 )( ( ( 8 - i )*end_0 + ( i - 1 )*end_1 )/7 );
+        }
+    }
+    else
+    {
+        // 6-value mode: four interpolated midpoints plus explicit 0 and 255
+        for( int i = 2; i < 6; ++i )
+        {
+            codes[i] = ( u8 )( ( ( 6 - i )*end_0 + ( i - 1 )*end_1 )/5 );
+        }
+        codes[6] = 0;
+        codes[7] = 255;
+    }
+
+    // unpack the indices: every 3 bytes hold eight 3-bit palette indices
+    u8 const* src = bytes + 2;
+    for( int group = 0; group < 2; ++group )
+    {
+        // gather 24 bits, least significant byte first
+        int packed = 0;
+        for( int j = 0; j < 3; ++j )
+        {
+            packed |= src[3*group + j] << ( 8*j );
+        }
+
+        // look each index up in the palette
+        for( int j = 0; j < 8; ++j )
+        {
+            texels[8*group + j] = codes[( packed >> ( 3*j ) ) & 0x7];
+        }
+    }
+}
+
+// Decompresses a 16-byte BC5 (two-channel) block into 16 RGBA texels.
+//
+// @param rgba   Output, 64 bytes: red and green are decoded, blue is set to 0
+//               and alpha to 255.
+// @param block  Input, 16 bytes: the red BC4 block followed by the green one.
+//
+// Note that BC4/BC5 channels use 3-bit palette indices, unlike the 2-bit
+// indices of DXT1 colour blocks.
+void DecompressColourBc5( u8* rgba, void const* block)
+{
+    // get the block bytes
+    u8 const* bytes = reinterpret_cast< u8 const* >( block );
+
+    // red occupies the first 8 bytes, green the last 8
+    u8 red[16];
+    u8 green[16];
+    DecompressChannelBc4( red, bytes );
+    DecompressChannelBc4( green, bytes + 8 );
+
+    // store out the colours
+    for( int i = 0; i < 16; ++i )
+    {
+        rgba[4*i] = red[i];
+        rgba[4*i +1] = green[i];
+        rgba[4*i +2] = 0;
+        rgba[4*i +3] = 255;
+    }
+}
+// -- GODOT end --
+
+
 } // namespace squish
diff --git a/thirdparty/squish/colourblock.h b/thirdparty/squish/colourblock.h
index fee2cd7c5d..3cb9b7e3b9 100644
--- a/thirdparty/squish/colourblock.h
+++ b/thirdparty/squish/colourblock.h
@@ -35,6 +35,9 @@ void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void*
 void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
 
 void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
+// -- GODOT start --
+void DecompressColourBc5( u8* rgba, void const* block );
+// -- GODOT end --
 
 } // namespace squish
 
diff --git a/thirdparty/squish/squish.cpp b/thirdparty/squish/squish.cpp
index 1d22a64ad6..fd11a147de 100644
--- a/thirdparty/squish/squish.cpp
+++ b/thirdparty/squish/squish.cpp
@@ -135,7 +135,13 @@ void Decompress( u8* rgba, void const* block, int flags )
         colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
 
     // decompress colour
-    DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+    // -- GODOT start --
+    //DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+    if(( flags & ( kBc5 ) ) != 0)
+        DecompressColourBc5( rgba, colourBlock);
+    else
+        DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+    // -- GODOT end --
 
     // decompress alpha separately if necessary
     if( ( flags & kDxt3 ) != 0 )
diff --git a/thirdparty/thekla_atlas/LICENSE b/thirdparty/thekla_atlas/LICENSE
new file mode 100644
index 0000000000..164e7d3a2b
--- /dev/null
+++ b/thirdparty/thekla_atlas/LICENSE
@@ -0,0 +1,8 @@
+Copyright (c) 2013 Thekla, Inc
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/thirdparty/thekla_atlas/nvconfig.h b/thirdparty/thekla_atlas/nvconfig.h
new file mode 100644
index 0000000000..815bc3ec75
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvconfig.h
@@ -0,0 +1,37 @@
+#ifndef NV_CONFIG
+#define NV_CONFIG
+
+#if NV_OS_DARWIN
+
+// Hardcoded.
+
+#define NV_HAVE_UNISTD_H
+#define NV_HAVE_STDARG_H
+#define NV_HAVE_SIGNAL_H
+#define NV_HAVE_EXECINFO_H
+//#define NV_HAVE_MALLOC_H
+
+#else
+
+//#define HAVE_UNISTD_H
+#define NV_HAVE_STDARG_H
+//#define HAVE_SIGNAL_H
+//#define HAVE_EXECINFO_H
+//#define HAVE_MALLOC_H
+
+#endif
+
+//#define HAVE_OPENMP // Only in MSVC pro edition.
+
+//#cmakedefine HAVE_PNG
+//#cmakedefine HAVE_JPEG
+//#cmakedefine HAVE_TIFF
+//#cmakedefine HAVE_OPENEXR
+//#cmakedefine HAVE_FREEIMAGE
+#if !NV_OS_IOS
+#define NV_HAVE_STBIMAGE
+#endif
+
+//#cmakedefine HAVE_MAYA
+
+#endif // NV_CONFIG
diff --git a/thirdparty/thekla_atlas/nvcore/Array.h b/thirdparty/thekla_atlas/nvcore/Array.h
new file mode 100644
index 0000000000..b295cb2b0c
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Array.h
@@ -0,0 +1,182 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_ARRAY_H
+#define NV_CORE_ARRAY_H
+
+/*
+This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
+using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
+are not supported.
+
+Note also that push_back and resize do not support inserting elements that are stored in the same
+container. This is forbidden to prevent an extra copy.
+*/
+
+
+#include "Memory.h"
+#include "Debug.h"
+#include "ForEach.h" // PseudoIndex
+
+
+namespace nv 
+{
+    class Stream;
+
+    /**
+    * Replacement for std::vector that is easier to debug and provides
+    * some nice foreach enumerators. 
+    */
+    template<typename T>
+    class NVCORE_CLASS Array {
+    public:
+        typedef uint size_type;
+
+        // Default constructor.
+        NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
+
+        // Copy constructor.
+        NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(a.m_buffer, a.m_size);
+        }
+
+        // Constructor that initializes the vector with the given elements.
+        NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(ptr, num);
+        }
+
+        // Allocate array.
+        NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            setArrayCapacity(capacity);
+        }
+
+        // Destructor.
+        NV_FORCEINLINE ~Array() {
+            clear();
+            free<T>(m_buffer);
+        }
+
+
+        /// Const element access.
+        NV_FORCEINLINE const T & operator[]( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE const T & at( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Element access.
+        NV_FORCEINLINE T & operator[] ( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE T & at( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint size() const { return m_size; }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint count() const { return m_size; }
+
+        /// Get vector capacity.
+        NV_FORCEINLINE uint capacity() const { return m_capacity; }
+
+        /// Get const vector pointer.
+        NV_FORCEINLINE const T * buffer() const { return m_buffer; }
+
+        /// Get vector pointer.
+        NV_FORCEINLINE T * buffer() { return m_buffer; }
+
+        /// Provide begin/end pointers for C++11 range-based for loops.
+        NV_FORCEINLINE T * begin() { return m_buffer; }
+        NV_FORCEINLINE T * end() { return m_buffer + m_size; }
+        NV_FORCEINLINE const T * begin() const { return m_buffer; }
+        NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
+
+        /// Is vector empty.
+        NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
+
+        /// Is a null vector.
+        NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
+
+
+        T & append();
+        void push_back( const T & val );
+        void pushBack( const T & val );
+        Array<T> & append( const T & val );
+        Array<T> & operator<< ( T & t );
+        void pop_back();
+        void popBack(uint count = 1);
+        void popFront(uint count = 1);
+        const T & back() const;
+        T & back();
+        const T & front() const;
+        T & front();
+        bool contains(const T & e) const;
+        bool find(const T & element, uint * indexPtr) const;
+        bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
+        void removeAt(uint index);
+        bool remove(const T & element);
+        void insertAt(uint index, const T & val = T());
+        void append(const Array<T> & other);
+        void append(const T other[], uint count);
+        void replaceWithLast(uint index);
+        void resize(uint new_size);
+        void resize(uint new_size, const T & elem);
+        void fill(const T & elem);
+        void clear();
+        void shrink();
+        void reserve(uint desired_size);
+        void copy(const T * data, uint count);
+        Array<T> & operator=( const Array<T> & a );
+        T * release();
+
+
+        // Array enumerator.
+        typedef uint PseudoIndex;
+
+        NV_FORCEINLINE PseudoIndex start() const { return 0; }
+        NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
+        NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
+
+#if NV_NEED_PSEUDOINDEX_WRAPPER
+        NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
+            return m_buffer[i(this)];
+        }
+        NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
+            return m_buffer[i(this)];
+        }
+#endif
+
+        // Friends.
+        template <typename Typ> 
+        friend Stream & operator<< ( Stream & s, Array<Typ> & p );
+
+        template <typename Typ>
+        friend void swap(Array<Typ> & a, Array<Typ> & b);
+
+
+    protected:
+
+        void setArraySize(uint new_size);
+        void setArrayCapacity(uint new_capacity);
+
+        T * m_buffer;
+        uint m_capacity;
+        uint m_size;
+
+    };
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_H
diff --git a/thirdparty/thekla_atlas/nvcore/Array.inl b/thirdparty/thekla_atlas/nvcore/Array.inl
new file mode 100644
index 0000000000..0b4de28ba9
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Array.inl
@@ -0,0 +1,452 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_ARRAY_INL
+#define NV_CORE_ARRAY_INL
+
+#include "Array.h"
+
+#include "Stream.h"
+#include "Utils.h" // swap
+
+#include <string.h>	// memmove
+#include <new> // for placement new
+
+
+
+namespace nv 
+{
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::append()
+    {
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size);
+
+        return m_buffer[old_size]; // Return reference to last element.
+    }
+
+    // Push an element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::push_back( const T & val )
+    {
+#if 1
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
+
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size, val);
+#else
+        uint new_size = m_size + 1;
+
+        if (new_size > m_capacity)
+        {
+            // @@ Is there any way to avoid this copy?
+            // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
+            // @@ Assert instead of copy?
+            const T copy(val);	// create a copy in case value is inside of this array.
+
+            setArraySize(new_size);
+
+            new (m_buffer+new_size-1) T(copy);
+        }
+        else
+        {
+            m_size = new_size;
+            new(m_buffer+new_size-1) T(val);
+        }
+#endif // 0/1
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pushBack( const T & val )
+    {
+        push_back(val);
+    }
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
+    {
+        push_back(val);
+        return *this;
+    }
+
+    // Qt like push operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
+    {
+        push_back(t);
+        return *this;
+    }
+
+    // Pop the element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pop_back()
+    {
+        nvDebugCheck( m_size > 0 );
+        resize( m_size - 1 );
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popBack(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        resize(m_size - count);
+    }
+
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popFront(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        //resize(m_size - count);
+
+        if (m_size == count) {
+            clear();
+        }
+        else {
+            destroy_range(m_buffer, 0, count);
+
+            memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count));
+
+            m_size -= count;
+        }
+
+    }
+
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::back() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::back()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::front() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index >= 0 && index < m_size);
+
+        if (m_size == 1) {
+            clear();
+        }
+        else {
+            m_buffer[index].~T();
+
+            memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
+            m_size--;
+        }
+    }
+
+    // Remove the first instance of the given element.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint index;
+        if (find(element, &index)) {
+            removeAt(index);
+            return true;
+        }
+        return false;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index >= 0 && index <= m_size );
+
+        setArraySize(m_size + 1);
+
+        if (index < m_size - 1) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count > 0) {
+            const uint old_size = m_size;
+
+            setArraySize(m_size + count);
+
+            for (uint i = 0; i < count; i++ ) {
+                new(m_buffer + old_size + i) T(other[i]);
+            }
+        }
+    }
+
+
+    // Remove the given element by replacing it with the last one.
+    template <typename T> 
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+        nv::swap(m_buffer[index], back());      // @@ Is this OK when index == size-1?
+        (m_buffer+m_size-1)->~T();
+        m_size--;
+    }
+
+    // Resize the vector preserving existing elements.
+    template <typename T> 
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value.
+    template <typename T> 
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size);
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value.
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        fill(m_buffer, m_size, elem);
+    }
+
+    // Clear the buffer.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // More simple, but maybe not be as efficient?
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator. Replaces the contents with a copy of 'a' and returns *this.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        if (this != &a) copy(a.m_buffer, a.m_size); // -- GODOT -- copy() destroys our elements before reading 'a', so skip self-assignment.
+        return *this;
+    }
+
+    // Release ownership of allocated memory and returns pointer to it.
+    template <typename T>
+    T * Array<T>::release() {
+        T * tmp = m_buffer;
+        m_buffer = NULL;
+        m_capacity = 0;
+        m_size = 0;
+        return tmp;
+    }
+
+
+
+    // Change array size.
+    template <typename T> 
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size > m_capacity) {
+            uint new_buffer_size;
+            if (m_capacity == 0) {
+                // first allocation is exact
+                new_buffer_size = new_size;
+            }
+            else {
+                // following allocations grow array by 25%
+                new_buffer_size = new_size + (new_size >> 2);
+            }
+
+            setArrayCapacity( new_buffer_size );
+        }
+    }
+
+    // Change array capacity.
+    template <typename T> 
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity == 0) {
+            // free the buffer.
+            if (m_buffer != NULL) {
+                free<T>(m_buffer);
+                m_buffer = NULL;
+            }
+        }
+        else {
+            // realloc the buffer
+            m_buffer = realloc<T>(m_buffer, new_capacity);
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization.
+    template <typename Typ> 
+    inline Stream & operator<< ( Stream & s, Array<Typ> & p )
+    {
+        if (s.isLoading()) {
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+        else {
+            s << p.m_size;
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    template <typename Typ>
+    inline void swap(Array<Typ> & a, Array<Typ> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+// IC: These functions are for compatibility with the Foreach macro in The Witness.
+template <typename T> inline int item_count(const nv::Array<T> & array) { return array.count(); }
+template <typename T> inline const T & item_at(const nv::Array<T> & array, int i) { return array.at(i); }
+template <typename T> inline T & item_at(nv::Array<T> & array, int i) { return array.at(i); }
+template <typename T> inline int item_advance(const nv::Array<T> & array, int i) { return ++i; }
+template <typename T> inline int item_remove(nv::Array<T> & array, int i) { array.replaceWithLast(i); return i - 1; }
+
+template <typename T> inline int item_count(const nv::Array<T> * array) { return array->count(); }
+template <typename T> inline const T & item_at(const nv::Array<T> * array, int i) { return array->at(i); }
+template <typename T> inline T & item_at(nv::Array<T> * array, int i) { return array->at(i); }
+template <typename T> inline int item_advance(const nv::Array<T> * array, int i) { return ++i; }
+template <typename T> inline int item_remove(nv::Array<T> * array, int i) { array->replaceWithLast(i); return i - 1; }
+
+
+#endif // NV_CORE_ARRAY_INL
diff --git a/thirdparty/thekla_atlas/nvcore/BitArray.h b/thirdparty/thekla_atlas/nvcore/BitArray.h
new file mode 100644
index 0000000000..23cf880694
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/BitArray.h
@@ -0,0 +1,250 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_BITARRAY_H
+#define NV_CORE_BITARRAY_H
+
+#include "nvcore.h"
+#include "Array.inl"
+
+namespace nv
+{
+
+    // Counts the bits set in x, testing one bit per iteration. @@ Uh, this could be much faster.
+    inline uint countSetBits(uint32 x) {
+        uint total = 0;
+        for(uint32 rest = x; rest != 0; rest >>= 1) {
+            if (rest & 1) total++;
+        }
+        return total;
+    }
+
+    // Counts the bits set among the lowest 'bits' bits of x (for bits >= 0). @@ This is even more lame. What was I thinking?
+    inline uint countSetBits(uint32 x, int bits) {
+        uint total = 0;
+        for(; bits != 0 && x != 0; bits--, x >>= 1) {
+            if (x & 1) total++;
+        }
+        return total;
+    }
+
+    // See "Conditionally set or clear bits without branching" at http://graphics.stanford.edu/~seander/bithacks.html
+    // Returns w with the bits selected by mask m set when b is true and cleared when b is false.
+    inline uint setBits(uint w, uint m, bool b) {
+        return w ^ ((-int(b) ^ w) & m);
+    }
+
+
+    // Simple bit array: m_size bits packed 32 per word into an Array<uint>.
+    class BitArray
+    {
+    public:
+        // An empty array holds zero bits; m_size is initialized so size() and append() are well defined.
+        BitArray() : m_size(0) {}
+        BitArray(uint sz) {
+            resize(sz);
+        }
+
+        uint size() const { return m_size; }
+        void clear() { resize(0); }
+        // Changes the number of bits; this overload does not itself assign a value to added bits.
+        void resize(uint new_size)
+        {
+            m_size = new_size;
+            m_wordArray.resize( (m_size + 31) >> 5 );
+        }
+        // Changes the number of bits and gives every bit past the old size the value 'init'.
+        void resize(uint new_size, bool init)
+        {
+            //if (new_size == m_size) return;
+
+            uint old_size = m_size;
+            uint size_mod_32 = old_size & 31;
+            uint last_word_index = ((old_size + 31) >> 5) - 1;  // Wraps when old_size == 0, but is only used when size_mod_32 != 0.
+            uint mask = (1u << size_mod_32) - 1;
+
+            uint init_dword;
+            if (init) {
+                if (size_mod_32) m_wordArray[last_word_index] |= ~mask;
+                init_dword = ~0;
+            }
+            else {
+                if (size_mod_32) m_wordArray[last_word_index] &= mask;
+                init_dword = 0;
+            }
+
+            m_size = new_size;
+            m_wordArray.resize((new_size + 31) >> 5, init_dword);
+
+            // Make sure new bits are initialized correctly.
+            /*for (uint i = old_size; i < new_size; i++) {
+                nvCheck(bitAt(i) == init);
+            }*/
+        }
+
+
+        /// Get bit.
+        bool bitAt(uint b) const
+        {
+            nvDebugCheck( b < m_size );
+            return (m_wordArray[b >> 5] & (1u << (b & 31))) != 0;   // 1u: shifting a signed 1 into bit 31 overflows.
+        }
+
+        // It may be useful to pack multiple bit arrays together interleaving their bits.
+        uint bitsAt(uint idx, uint count) const
+        {
+            //nvDebugCheck(count == 2 || count == 4 || count == 8 || count == 16 || count == 32);
+            nvDebugCheck(count == 2);   // @@ Hardcoded for two.
+            uint b = idx * count;
+            nvDebugCheck(b < m_size);
+            return (m_wordArray[b >> 5] & (0x3u << (b & 31))) >> (b & 31);
+        }
+
+        // It would be useful to have a function to set two bits simultaneously.
+        /*void setBitsAt(uint idx, uint count, uint bits) const
+        {
+            //nvDebugCheck(count == 2 || count == 4 || count == 8 || count == 16 || count == 32);
+            nvDebugCheck(count == 2);   // @@ Hardcoded for two.
+            uint b = idx * count;
+            nvDebugCheck(b < m_size);
+            return (m_wordArray[b >> 5] & (0x3 << (b & 31))) >> (b & 31);
+        }*/
+
+
+
+        // Set a bit.
+        void setBitAt(uint idx)
+        {
+            nvDebugCheck(idx < m_size);
+            m_wordArray[idx >> 5] |=  (1u << (idx & 31));
+        }
+
+        // Clear a bit.
+        void clearBitAt(uint idx)
+        {
+            nvDebugCheck(idx < m_size);
+            m_wordArray[idx >> 5] &= ~(1u << (idx & 31));
+        }
+
+        // Toggle a bit.
+        void toggleBitAt(uint idx)
+        {
+            nvDebugCheck(idx < m_size);
+            m_wordArray[idx >> 5] ^= (1u << (idx & 31));
+        }
+
+        // Set a bit to the given value. @@ Rename modifyBitAt?
+        void setBitAt(uint idx, bool b)
+        {
+            nvDebugCheck(idx < m_size);
+            m_wordArray[idx >> 5] = setBits(m_wordArray[idx >> 5], 1u << (idx & 31), b);
+            nvDebugCheck(bitAt(idx) == b);
+        }
+
+        void append(bool value)
+        {
+            resize(m_size + 1);
+            setBitAt(m_size - 1, value);
+        }
+
+
+        // Clear all the bits.
+        void clearAll()
+        {
+            memset(m_wordArray.buffer(), 0, m_wordArray.size() * sizeof(uint));
+        }
+
+        // Set all the bits.
+        void setAll()
+        {
+            memset(m_wordArray.buffer(), 0xFF, m_wordArray.size() * sizeof(uint));
+        }
+
+        // Toggle all the bits.
+        void toggleAll()
+        {
+            const uint wordCount = m_wordArray.count();
+            for(uint b = 0; b < wordCount; b++) {
+                m_wordArray[b] ^= 0xFFFFFFFF;
+            }
+        }
+
+        // Count the number of bits set.
+        uint countSetBits() const
+        {
+            const uint num = m_wordArray.size();
+            if( num == 0 ) {
+                return 0;
+            }
+
+            uint count = 0;
+            for(uint i = 0; i < num - 1; i++) {
+                count += nv::countSetBits(m_wordArray[i]);
+            }
+            count += nv::countSetBits(m_wordArray[num - 1], ((m_size - 1) & 31) + 1);   // Last word holds 1..32 valid bits.
+
+            //piDebugCheck(count + countClearBits() == m_size);
+            return count;
+        }
+
+        // Count the number of bits clear.
+        uint countClearBits() const {
+
+            const uint num = m_wordArray.size();
+            if( num == 0 ) {
+                return 0;
+            }
+
+            uint count = 0;
+            for(uint i = 0; i < num - 1; i++) {
+                count += nv::countSetBits(~m_wordArray[i]);
+            }
+            count += nv::countSetBits(~m_wordArray[num - 1], ((m_size - 1) & 31) + 1);  // Last word holds 1..32 valid bits.
+
+            //piDebugCheck(count + countSetBits() == m_size);
+            return count;
+        }
+
+        friend void swap(BitArray & a, BitArray & b)
+        {
+            swap(a.m_size, b.m_size);
+            swap(a.m_wordArray, b.m_wordArray);
+        }
+
+        void operator &= (const BitArray & other) {
+            if (other.m_size != m_size) {
+                resize(other.m_size);
+            }
+
+            const uint wordCount = m_wordArray.count();
+            for (uint i = 0; i < wordCount; i++) {
+                m_wordArray[i] &= other.m_wordArray[i];
+            }
+        }
+
+        void operator |= (const BitArray & other) {
+            if (other.m_size != m_size) {
+                resize(other.m_size);
+            }
+
+            const uint wordCount = m_wordArray.count();
+            for (uint i = 0; i < wordCount; i++) {
+                m_wordArray[i] |= other.m_wordArray[i];
+            }
+        }
+
+
+    private:
+
+        // Number of bits stored.
+        uint m_size;
+
+        // Array of bits.
+        Array<uint> m_wordArray;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_BITARRAY_H
+
diff --git a/thirdparty/thekla_atlas/nvcore/Debug.cpp b/thirdparty/thekla_atlas/nvcore/Debug.cpp
new file mode 100644
index 0000000000..4980ffa916
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Debug.cpp
@@ -0,0 +1,1357 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Debug.h"
+#include "Array.inl"
+#include "StrLib.h" // StringBuilder
+
+#include "StdStream.h" // fileOpen
+
+#include <stdlib.h>
+
+// Extern
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+#   define WIN32_LEAN_AND_MEAN
+#   define VC_EXTRALEAN
+#   include <windows.h>
+#   include <direct.h>
+// -- GODOT start -
+#   include <crtdbg.h>
+#   if _MSC_VER < 1300
+#       define DECLSPEC_DEPRECATED
+// VC6: change this path to your Platform SDK headers
+#       include <dbghelp.h> // must be XP version of file
+//      include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h"
+#   else
+// VC7: ships with updated headers
+#       include <dbghelp.h>
+#   endif
+// -- GODOT end -
+#   pragma comment(lib,"dbghelp.lib")
+#endif
+
+#if NV_OS_XBOX
+#    include <Xtl.h>
+#    ifdef _DEBUG
+#        include <xbdm.h>
+#    endif //_DEBUG
+#endif //NV_OS_XBOX
+
+#if !NV_OS_WIN32 && defined(NV_HAVE_SIGNAL_H)
+#   include <signal.h>
+#endif
+
+#if NV_OS_UNIX
+#   include <unistd.h> // getpid
+#endif
+
+#if NV_OS_LINUX && defined(NV_HAVE_EXECINFO_H)
+#   include <execinfo.h> // backtrace
+#   if NV_CC_GNUC // defined(NV_HAVE_CXXABI_H)
+#       include <cxxabi.h>
+#   endif
+#endif
+
+#if NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#   include <sys/types.h>
+#   include <sys/param.h>
+#   include <sys/sysctl.h> // sysctl
+#   if !defined(NV_OS_OPENBSD)
+#       include <sys/ucontext.h>
+#   endif
+#   if defined(NV_HAVE_EXECINFO_H) // only after OSX 10.5
+#       include <execinfo.h> // backtrace
+#       if NV_CC_GNUC // defined(NV_HAVE_CXXABI_H)
+#           include <cxxabi.h>
+#       endif
+#   endif
+#endif
+
+#if NV_OS_ORBIS
+#include <libdbg.h>
+#endif
+
+#if NV_OS_DURANGO
+#include "Windows.h"
+#include <winnt.h>
+#include <crtdbg.h>
+#include <dbghelp.h>
+#include <errhandlingapi.h>
+#define NV_USE_SEPARATE_THREAD 0
+#else
+#define NV_USE_SEPARATE_THREAD 1
+#endif
+
+
+
+using namespace nv;
+
+namespace 
+{
+
+    static MessageHandler * s_message_handler = NULL;
+    static AssertHandler * s_assert_handler = NULL;
+
+    static bool s_sig_handler_enabled = false;
+    static bool s_interactive = true;
+
+#if (NV_OS_WIN32 && NV_CC_MSVC) || NV_OS_DURANGO
+
+    // Old exception filter.
+    static LPTOP_LEVEL_EXCEPTION_FILTER s_old_exception_filter = NULL;
+
+#elif !NV_OS_WIN32 && defined(NV_HAVE_SIGNAL_H)
+
+    // Old signal handlers.
+    struct sigaction s_old_sigsegv;
+    struct sigaction s_old_sigtrap;
+    struct sigaction s_old_sigfpe;
+    struct sigaction s_old_sigbus;
+
+#endif
+
+// -- GODOT start -
+#if NV_OS_WIN32 || NV_OS_DURANGO
+// -- GODOT end -
+
+    // We should try to simplify the top level filter as much as possible.
+    // http://www.nynaeve.net/?p=128
+
+    // The critical section enforcing the requirement that only one exception be
+    // handled by a handler at a time.
+    static CRITICAL_SECTION s_handler_critical_section;
+
+#if NV_USE_SEPARATE_THREAD
+    // Semaphores used to move exception handling between the exception thread
+    // and the handler thread.  handler_start_semaphore_ is signalled by the
+    // exception thread to wake up the handler thread when an exception occurs.
+    // handler_finish_semaphore_ is signalled by the handler thread to wake up
+    // the exception thread when handling is complete.
+    static HANDLE s_handler_start_semaphore = NULL;
+    static HANDLE s_handler_finish_semaphore = NULL;
+
+    // The exception handler thread.
+    static HANDLE s_handler_thread = NULL;
+
+    static DWORD s_requesting_thread_id = 0;
+    static EXCEPTION_POINTERS * s_exception_info = NULL;
+
+#endif // NV_USE_SEPARATE_THREAD
+
+
+    struct MinidumpCallbackContext {   // State handed to miniDumpWriteDumpCallback through its 'context' argument.
+        ULONG64 memory_base;    // Start address of the extra memory region to include in the dump.
+        ULONG memory_size;      // Size of that region (an address difference, so in bytes).
+        bool finished;          // Set by the callback once the region has been reported.
+    };
+
+#if NV_OS_WIN32
+    // Callback passed to MiniDumpWriteDump: keeps all modules and threads and adds one extra memory region.
+    static BOOL CALLBACK miniDumpWriteDumpCallback(PVOID context, const PMINIDUMP_CALLBACK_INPUT callback_input, PMINIDUMP_CALLBACK_OUTPUT callback_output)
+    {
+        switch (callback_input->CallbackType)
+        {
+        // Include all modules and all threads.
+        case IncludeModuleCallback:
+        case ModuleCallback:
+        case IncludeThreadCallback:
+        case ThreadCallback:
+            return TRUE;
+
+        // Stop receiving cancel callbacks.
+        case CancelCallback:
+            callback_output->Cancel = FALSE;
+            callback_output->CheckCancel = FALSE;
+            return TRUE;
+
+        // Report the memory region stored in the context, but only the first time we are asked.
+        case MemoryCallback: {
+            MinidumpCallbackContext* region = reinterpret_cast<MinidumpCallbackContext*>(context);
+            if (!region->finished) {
+                region->finished = true;
+                callback_output->MemoryBase = region->memory_base;
+                callback_output->MemorySize = region->memory_size;
+                return TRUE;
+            }
+            return FALSE;
+        }
+
+        default:
+            break;
+        }
+
+        // Ignore other callback types.
+        return FALSE;
+    }
+#endif
+
+    // Writes a minidump of the current process to "crash.dmp" (a timestamped file on d:\ for Durango).
+    // pExceptionInfo may be NULL; returns true only if MiniDumpWriteDump reports success.
+    static bool writeMiniDump(EXCEPTION_POINTERS * pExceptionInfo)
+    {
+#if NV_OS_DURANGO
+        // Get a handle to the minidump method.
+        typedef BOOL(WINAPI* MiniDumpWriteDumpPfn) (
+            _In_ HANDLE hProcess,
+            _In_ DWORD ProcessId,
+            _In_ HANDLE hFile,
+            _In_ MINIDUMP_TYPE DumpType,
+            _In_opt_ PMINIDUMP_EXCEPTION_INFORMATION ExceptionParam,
+            _In_opt_ PMINIDUMP_USER_STREAM_INFORMATION UserStreamParam,
+            _Reserved_ PVOID CallbackParam
+            );
+        MiniDumpWriteDumpPfn MiniDumpWriteDump = NULL;
+        HMODULE hToolHelpModule = ::LoadLibraryW(L"toolhelpx.dll");
+        if (hToolHelpModule != NULL) {   // LoadLibraryW reports failure with NULL, not INVALID_HANDLE_VALUE.
+            MiniDumpWriteDump = reinterpret_cast<MiniDumpWriteDumpPfn>(::GetProcAddress(hToolHelpModule, "MiniDumpWriteDump"));
+            if (!MiniDumpWriteDump) {
+                FreeLibrary(hToolHelpModule);
+                return false;
+            }
+        }
+        else
+            return false;
+
+        // Generate a decent filename.
+        nv::Path application_path(256);
+        HINSTANCE hinstance = GetModuleHandle(NULL);
+        GetModuleFileName(hinstance, application_path.str(), 256);
+        application_path.stripExtension();
+        const char * application_name = application_path.fileName();
+
+        SYSTEMTIME local_time;
+        GetLocalTime(&local_time);
+
+        char dump_filename[MAX_PATH] = {};
+        sprintf_s(dump_filename, "d:\\%s-%04d%02d%02d-%02d%02d%02d.dmp",
+            application_name,
+            local_time.wYear, local_time.wMonth, local_time.wDay,
+            local_time.wHour, local_time.wMinute, local_time.wSecond );
+#else
+        const char* dump_filename = "crash.dmp";
+#endif
+
+        // create the file
+        HANDLE hFile = CreateFileA(dump_filename, GENERIC_READ | GENERIC_WRITE,
+            FILE_SHARE_WRITE | FILE_SHARE_READ, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+        if (hFile == INVALID_HANDLE_VALUE) {
+#if NV_OS_DURANGO
+            FreeLibrary(hToolHelpModule);
+#endif
+            return false;
+        }
+
+        MINIDUMP_EXCEPTION_INFORMATION * pExInfo = NULL;
+#if NV_OS_WIN32
+        // 'callback' and 'context' are read by MiniDumpWriteDump below, so they must live at function scope.
+        MINIDUMP_CALLBACK_INFORMATION callback;
+        MinidumpCallbackContext context;
+        MINIDUMP_CALLBACK_INFORMATION * pCallback = NULL;
+#else
+        void * pCallback = NULL;
+#endif
+
+        MINIDUMP_EXCEPTION_INFORMATION ExInfo;
+        if (pExceptionInfo != NULL) {
+#if NV_USE_SEPARATE_THREAD
+            // The exception belongs to the thread that requested the dump, not to the handler thread running this code.
+            ExInfo.ThreadId = (s_requesting_thread_id != 0) ? s_requesting_thread_id : ::GetCurrentThreadId();
+#else
+            ExInfo.ThreadId = ::GetCurrentThreadId();
+#endif
+            ExInfo.ExceptionPointers = pExceptionInfo;
+            ExInfo.ClientPointers = FALSE;
+            pExInfo = &ExInfo;
+
+#if NV_OS_WIN32
+            // Find a memory region of 256 bytes centered on the
+            // faulting instruction pointer.
+            const ULONG64 instruction_pointer =
+            #if defined(_M_IX86)
+                pExceptionInfo->ContextRecord->Eip;
+            #elif defined(_M_AMD64)
+                pExceptionInfo->ContextRecord->Rip;
+            #else
+                #error Unsupported platform
+            #endif
+
+            MEMORY_BASIC_INFORMATION info;
+            if (VirtualQuery(reinterpret_cast<LPCVOID>(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT)
+            {
+                // Attempt to get 128 bytes before and after the instruction
+                // pointer, but settle for whatever's available up to the
+                // boundaries of the memory region.
+                const ULONG64 kIPMemorySize = 256;
+                context.memory_base = max(reinterpret_cast<ULONG64>(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2));
+                ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast<ULONG64>(info.BaseAddress) + info.RegionSize);
+                context.memory_size = static_cast<ULONG>(end_of_range - context.memory_base);
+                context.finished = false;
+
+                callback.CallbackRoutine = miniDumpWriteDumpCallback;
+                callback.CallbackParam = reinterpret_cast<void*>(&context);
+                pCallback = &callback;
+            }
+#endif
+        }
+
+        MINIDUMP_TYPE miniDumpType = (MINIDUMP_TYPE)(MiniDumpNormal|MiniDumpWithHandleData|MiniDumpWithThreadInfo);
+        // write the dump
+        BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, pExInfo, NULL, pCallback) != 0;
+        CloseHandle(hFile);
+#if NV_OS_DURANGO
+        FreeLibrary(hToolHelpModule);
+#endif
+
+        if (ok == FALSE) {
+            return false;
+        }
+        return true;
+    }
+
+#if NV_USE_SEPARATE_THREAD
+
+    // Handler-thread entry point: each time the start semaphore is signalled it writes a
+    // minidump for s_exception_info, then signals the finish semaphore.
+    static DWORD WINAPI ExceptionHandlerThreadMain(void* lpParameter) {
+        nvDebugCheck(s_handler_start_semaphore != NULL);
+        nvDebugCheck(s_handler_finish_semaphore != NULL);
+
+        for (;;) {
+            const DWORD wake_reason = WaitForSingleObject(s_handler_start_semaphore, INFINITE);
+            if (wake_reason != WAIT_OBJECT_0) continue;
+
+            writeMiniDump(s_exception_info);
+            // Allow the requesting thread to proceed.
+            ReleaseSemaphore(s_handler_finish_semaphore, 1, NULL);
+        }
+        // Not reached when the thread is unconditionally terminated by the ExceptionHandler destructor.
+        return 0;
+    }
+
+#endif // NV_USE_SEPARATE_THREAD
+
+    static bool hasStackTrace() {   // Always reports stack traces as available in this Win32/Durango branch.
+        return true;
+    }
+
+    /*static NV_NOINLINE int backtrace(void * trace[], int maxcount) {
+
+        // In Windows XP and Windows Server 2003, the sum of the FramesToSkip and FramesToCapture parameters must be less than 63.
+        int xp_maxcount = min(63-1, maxcount);
+
+        int count = RtlCaptureStackBackTrace(1, xp_maxcount, trace, NULL);
+        nvDebugCheck(count <= maxcount);
+
+        return count;
+    }*/
+
+#if NV_OS_WIN32
+    // Walks the stack described by 'ctx' with StackWalk64, examining at most 'maxcount' frames and storing
+    // the program counters after the first 'skip' frames in 'trace'. Returns the number of entries stored.
+    static NV_NOINLINE int backtraceWithSymbols(CONTEXT * ctx, void * trace[], int maxcount, int skip = 0) {
+        // Init the stack frame for this function
+        STACKFRAME64 stackFrame = { 0 };
+
+    #if NV_CPU_X86_64
+        DWORD dwMachineType = IMAGE_FILE_MACHINE_AMD64;
+        stackFrame.AddrPC.Offset = ctx->Rip;
+        stackFrame.AddrFrame.Offset = ctx->Rbp;
+        stackFrame.AddrStack.Offset = ctx->Rsp;
+    #elif NV_CPU_X86
+        DWORD dwMachineType = IMAGE_FILE_MACHINE_I386;
+        stackFrame.AddrPC.Offset = ctx->Eip;
+        stackFrame.AddrFrame.Offset = ctx->Ebp;
+        stackFrame.AddrStack.Offset = ctx->Esp;
+    #else
+        #error "Platform not supported!"
+    #endif
+        stackFrame.AddrPC.Mode = AddrModeFlat;
+        stackFrame.AddrFrame.Mode = AddrModeFlat;
+        stackFrame.AddrStack.Mode = AddrModeFlat;
+
+        // Walk up the stack
+        const HANDLE hThread = GetCurrentThread();
+        const HANDLE hProcess = GetCurrentProcess();
+        int i;
+        for (i = 0; i < maxcount; i++)
+        {
+            // walking once first makes us skip self
+            if (!StackWalk64(dwMachineType, hProcess, hThread, &stackFrame, ctx, NULL, &SymFunctionTableAccess64, &SymGetModuleBase64, NULL)) {
+                break;
+            }
+
+            /*if (stackFrame.AddrPC.Offset == stackFrame.AddrReturn.Offset || stackFrame.AddrPC.Offset == 0) {
+                break;
+            }*/
+            if (i >= skip) {
+                trace[i - skip] = (PVOID)stackFrame.AddrPC.Offset;
+            }
+        }
+
+        return (i > skip) ? (i - skip) : 0;   // Never report a negative count when the walk ended within the skipped frames.
+    }
+
+#pragma warning(push)
+#pragma warning(disable:4748)
+    static NV_NOINLINE int backtrace(void * trace[], int maxcount) {   // Captures the current context and walks it into 'trace'.
+        CONTEXT ctx = { 0 };
+// -- GODOT start --
+#if NV_CPU_X86 && !NV_CPU_X86_64
+        ctx.ContextFlags = CONTEXT_CONTROL;
+#if NV_CC_MSVC
+        _asm {
+             call x
+          x: pop eax
+             mov ctx.Eip, eax
+             mov ctx.Ebp, ebp
+             mov ctx.Esp, esp
+        }
+#else
+        register long unsigned int ebp asm("ebp");
+        ctx.Eip = (DWORD) __builtin_return_address(0);
+        ctx.Ebp = ebp;
+        ctx.Esp = (DWORD) __builtin_frame_address(0);
+#endif
+// -- GODOT end --
+#else
+        RtlCaptureContext(&ctx); // Not implemented correctly in x86.
+#endif
+        // Walk from the captured context, skipping the first frame (presumably this function itself).
+        return backtraceWithSymbols(&ctx, trace, maxcount, 1);
+    }
+#pragma warning(pop)
+
+    // Resolves trace[start .. size-1] to "file(line) : function" text and appends one string per resolved
+    // frame to 'lines'. Each string comes from StringBuilder::release(), so the caller must free it.
+    static NV_NOINLINE void writeStackTrace(void * trace[], int size, int start, Array<const char *> & lines)
+    {
+        StringBuilder builder(512);
+
+        HANDLE hProcess = GetCurrentProcess();
+        // Resolve PC to function names
+        for (int i = start; i < size; i++)
+        {
+            // Check for end of stack walk
+            DWORD64 ip = (DWORD64)trace[i];
+            if (ip == 0)
+                break;
+
+            // Get function name
+            #define MAX_STRING_LEN  (512)
+            unsigned char byBuffer[sizeof(IMAGEHLP_SYMBOL64) + MAX_STRING_LEN] = { 0 };
+            IMAGEHLP_SYMBOL64 * pSymbol = (IMAGEHLP_SYMBOL64*)byBuffer;
+            pSymbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64);
+            pSymbol->MaxNameLength = MAX_STRING_LEN;
+
+            DWORD64 dwDisplacement;
+            if (SymGetSymFromAddr64(hProcess, ip, &dwDisplacement, pSymbol))
+            {
+                pSymbol->Name[MAX_STRING_LEN-1] = 0;
+
+                /*
+                // Make the symbol readable for humans
+                UnDecorateSymbolName( pSym->Name, lpszNonUnicodeUnDSymbol, BUFFERSIZE, 
+                    UNDNAME_COMPLETE | 
+                    UNDNAME_NO_THISTYPE |
+                    UNDNAME_NO_SPECIAL_SYMS |
+                    UNDNAME_NO_MEMBER_TYPE |
+                    UNDNAME_NO_MS_KEYWORDS |
+                    UNDNAME_NO_ACCESS_SPECIFIERS );
+                */
+
+                // pSymbol->Name
+                const char * pFunc = pSymbol->Name;
+
+                // Get file/line number
+                IMAGEHLP_LINE64 theLine = { 0 };
+                theLine.SizeOfStruct = sizeof(theLine);
+
+                DWORD lineDisplacement;   // Separate name: SymGetLineFromAddr64 takes a DWORD, unlike the DWORD64 above.
+                if (!SymGetLineFromAddr64(hProcess, ip, &lineDisplacement, &theLine))
+                {
+                    // Do not print unknown symbols anymore.
+                    //break;
+                    builder.format("unknown(%08X) : %s\n", (uint32)ip, pFunc);
+                }
+                else
+                {
+                    /*
+                    const char* pFile = strrchr(theLine.FileName, '\\');
+                    if ( pFile == NULL ) pFile = theLine.FileName;
+                    else pFile++;
+                    */
+                    const char * pFile = theLine.FileName;
+
+                    int line = theLine.LineNumber;
+
+                    builder.format("%s(%d) : %s\n", pFile, line, pFunc);
+                }
+
+                lines.append(builder.release());
+
+                if (pFunc != NULL && strcmp(pFunc, "WinMain") == 0) {
+                    break;
+                }
+            }
+        }
+    }
+#endif
+
+    // Exception filter: writes a mini dump, then either breaks into an attached debugger (returning
+    // EXCEPTION_CONTINUE_EXECUTION) or writes a stack trace to crash.txt (Win32 only) and terminates the process.
+    static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo)
+    {
+        EnterCriticalSection(&s_handler_critical_section);
+#if NV_USE_SEPARATE_THREAD
+        s_requesting_thread_id = GetCurrentThreadId();
+        s_exception_info = pExceptionInfo;
+
+        // This causes the handler thread to call writeMiniDump.
+        ReleaseSemaphore(s_handler_start_semaphore, 1, NULL);
+
+        // Wait until WriteMinidumpWithException is done and collect its return value.
+        WaitForSingleObject(s_handler_finish_semaphore, INFINITE);
+        //bool status = s_handler_return_value;
+
+        // Clean up.
+        s_requesting_thread_id = 0;
+        s_exception_info = NULL;
+#else
+        // First of all, write mini dump.
+        writeMiniDump(pExceptionInfo);
+#endif
+        LeaveCriticalSection(&s_handler_critical_section);
+        // NOTE(review): the result of writeMiniDump is not checked, so this is printed even if the dump failed.
+        nvDebug("\nDump file saved.\n");
+
+        // Try to attach to debugger.
+        if (s_interactive && debug::attachToDebugger()) {
+            nvDebugBreak();
+            return EXCEPTION_CONTINUE_EXECUTION;
+        }
+
+#if NV_OS_WIN32
+        // If that fails, then try to pretty print a stack trace and terminate.
+        void * trace[64];
+        int size = backtraceWithSymbols(pExceptionInfo->ContextRecord, trace, 64);
+
+        // @@ Use win32's CreateFile?
+        FILE * fp = fileOpen("crash.txt", "wb");
+        if (fp != NULL) {
+            Array<const char *> lines;
+            writeStackTrace(trace, size, 0, lines);
+
+            for (uint i = 0; i < lines.count(); i++) {
+                fputs(lines[i], fp);
+                delete lines[i];    // NOTE(review): assumes StringBuilder::release() memory may be freed with delete - confirm.
+            }
+
+            // @@ Add more info to crash.txt?
+
+            fclose(fp);
+        }
+#endif
+
+        // This should terminate the process and set the error exit code.
+        TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 2);
+
+        return EXCEPTION_EXECUTE_HANDLER;   // Terminate app. In case terminate process did not succeed.
+    }
+
+    static void handlePureVirtualCall() {   // Calls nvDebugBreak(), then terminates the process with exit code EXIT_FAILURE + 8.
+        nvDebugBreak();
+        TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8);
+    }
+
+    // Converts the wide-character expression, function and file strings to narrow strings, forwards them
+    // to nvAbort, and calls nvDebugBreak() when nvAbort answers NV_ABORT_DEBUG. NULL inputs are left unconverted.
+    static void handleInvalidParameter(const wchar_t * wexpresion, const wchar_t * wfunction, const wchar_t * wfile, unsigned int line, uintptr_t reserved) {
+        size_t convertedCharCount = 0;
+
+        StringBuilder expresion;
+        StringBuilder function;
+        StringBuilder file;
+
+        if (wexpresion != NULL) {
+            const uint capacity = U32(wcslen(wexpresion) + 1);
+            expresion.reserve(capacity);
+            wcstombs_s(&convertedCharCount, expresion.str(), capacity, wexpresion, _TRUNCATE);
+        }
+        if (wfunction != NULL) {
+            const uint capacity = U32(wcslen(wfunction) + 1);
+            function.reserve(capacity);
+            wcstombs_s(&convertedCharCount, function.str(), capacity, wfunction, _TRUNCATE);
+        }
+        if (wfile != NULL) {
+            const uint capacity = U32(wcslen(wfile) + 1);
+            file.reserve(capacity);
+            wcstombs_s(&convertedCharCount, file.str(), capacity, wfile, _TRUNCATE);
+        }
+
+        // Report through the regular abort path; only break when the handler asks for the debugger.
+        if (nvAbort(expresion.str(), file.str(), line, function.str()) == NV_ABORT_DEBUG) {
+            nvDebugBreak();
+        }
+    }
+
+#elif !NV_OS_WIN32 && defined(NV_HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN
+
+#if defined(NV_HAVE_EXECINFO_H)
+
+    static bool hasStackTrace() {   // Always true in this branch, which is only compiled when NV_HAVE_EXECINFO_H is defined.
+        return true;
+    }
+
+
+    // Formats trace[start .. size-2] with backtrace_symbols (demangling names on GNU compilers) and appends
+    // one string per frame to 'lines'. Each string comes from StringBuilder::release(); the caller frees it.
+    static void writeStackTrace(void * trace[], int size, int start, Array<const char *> & lines) {
+        StringBuilder builder(512);
+        char ** string_array = backtrace_symbols(trace, size);
+        if (string_array == NULL) return;   // backtrace_symbols failed; there is nothing to format.
+        for(int i = start; i < size-1; i++ ) {
+            // IC: Just in case.
+            if (string_array[i] == NULL || string_array[i][0] == '\0') break;
+#       if NV_CC_GNUC // defined(NV_HAVE_CXXABI_H)
+            // @@ Write a better parser for the possible formats.
+            char * begin = strchr(string_array[i], '(');
+            char * end = strrchr(string_array[i], '+');
+            char * module = string_array[i];
+
+            if (begin == 0 && end != 0 && end > string_array[i]) {   // end[-1] must stay inside the string.
+                *(end - 1) = '\0';
+                begin = strrchr(string_array[i], ' ');
+                module = NULL; // Ignore module.
+            }
+
+            if (begin != 0 && begin < end) {
+                int stat;
+                *end = '\0';
+                *begin = '\0';
+                char * name = abi::__cxa_demangle(begin+1, 0, 0, &stat);
+                if (module == NULL) {
+                    if (name == NULL || stat != 0) {
+                        builder.format("  In: '%s'\n", begin+1);
+                    }
+                    else {
+                        builder.format("  In: '%s'\n", name);
+                    }
+                }
+                else {
+                    if (name == NULL || stat != 0) {
+                        builder.format("  In: [%s] '%s'\n", module, begin+1);
+                    }
+                    else {
+                        builder.format("  In: [%s] '%s'\n", module, name);
+                    }
+                }
+                free(name);
+            }
+            else {
+                builder.format("  In: '%s'\n", string_array[i]);
+            }
+#       else
+            builder.format("  In: '%s'\n", string_array[i]);
+#       endif
+            lines.append(builder.release());
+        }
+        free(string_array);
+    }
+
+    // Prints the trace through nvDebug. NOTE(review): 'start' is unused; frames are always written from index 1.
+    static void printStackTrace(void * trace[], int size, int start=0) {
+        nvDebug( "\nDumping stacktrace:\n" );
+        Array<const char *> lines;
+        writeStackTrace(trace, size, 1, lines);
+        const uint line_count = lines.count();
+        for (uint n = 0; n != line_count; n++) {
+            const char * text = lines[n];
+            nvDebug("%s", text);
+            delete text;
+        }
+        nvDebug("\n");
+    }
+
+#endif // defined(NV_HAVE_EXECINFO_H)
+
+    static void * callerAddress(void * secret)   // Reads the saved program counter from the ucontext_t passed as 'secret'.
+    {
+#if NV_OS_DARWIN
+#  if defined(_STRUCT_MCONTEXT)
+#    if NV_CPU_PPC
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->__ss.__srr0;
+#    elif NV_CPU_X86_64
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->__ss.__rip;
+#    elif NV_CPU_X86
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->__ss.__eip;
+#    elif NV_CPU_ARM
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->__ss.__pc;
+#    else
+#      error "Unknown CPU"
+#    endif
+#  else
+#    if NV_CPU_PPC
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->ss.srr0;
+#    elif NV_CPU_X86
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext->ss.eip;
+#    else
+#      error "Unknown CPU"
+#    endif
+#  endif
+#elif NV_OS_FREEBSD
+#  if NV_CPU_X86_64
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->uc_mcontext.mc_rip;
+#  elif NV_CPU_X86
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->uc_mcontext.mc_eip;
+#    else
+#      error "Unknown CPU"
+#    endif
+#elif NV_OS_OPENBSD
+#  if NV_CPU_X86_64
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->sc_rip;
+#  elif NV_CPU_X86
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->sc_eip;
+#  else
+#       error "Unknown CPU"
+#  endif        
+#else
+#  if NV_CPU_X86_64
+        // #define REG_RIP REG_INDEX(rip) // seems to be 16
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->uc_mcontext.gregs[REG_RIP];
+#  elif NV_CPU_X86
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *)ucp->uc_mcontext.gregs[14/*REG_EIP*/];
+#  elif NV_CPU_PPC
+        ucontext_t * ucp = (ucontext_t *)secret;
+        return (void *) ucp->uc_mcontext.regs->nip;
+#    else
+#      error "Unknown CPU"
+#    endif
+#endif
+        // Not reached: every OS/CPU combination above either returns or stops the build with #error.
+        // How to obtain the instruction pointers in different platforms, from mlton's source code.
+        // http://mlton.org/
+        // OpenBSD && NetBSD
+        // ucp->sc_eip
+        // FreeBSD:
+        // ucp->uc_mcontext.mc_eip
+        // HPUX:
+        // ucp->uc_link
+        // Solaris:
+        // ucp->uc_mcontext.gregs[REG_PC]
+        // Linux hppa:
+        // uc->uc_mcontext.sc_iaoq[0] & ~0x3UL
+        // Linux sparc:
+        // ((struct sigcontext*) secret)->sigc_regs.tpc
+        // Linux sparc64:
+        // ((struct sigcontext*) secret)->si_regs.pc
+
+        // potentially correct for other archs:
+        // Linux alpha: ucp->m_context.sc_pc
+        // Linux arm: ucp->m_context.ctx.arm_pc
+        // Linux ia64: ucp->m_context.sc_ip & ~0x3UL
+        // Linux mips: ucp->m_context.sc_pc
+        // Linux s390: ucp->m_context.sregs->regs.psw.addr
+    }
+
+    // SA_SIGINFO signal handler: logs the signal, prints a stack trace when available, then exits with a failure status.
+    static void nvSigHandler(int sig, siginfo_t *info, void *secret)
+    {
+        void * pnt = callerAddress(secret); // Instruction pointer taken from the signal context.
+        // Describe the signal; SIGSEGV additionally reports the faulting address from siginfo_t.
+        if (sig == SIGSEGV) {
+            if (pnt != NULL) nvDebug("Got signal %d, faulty address is %p, from %p\n", sig, info->si_addr, pnt);
+            else nvDebug("Got signal %d, faulty address is %p\n", sig, info->si_addr);
+        }
+        else if(sig == SIGTRAP) {
+            nvDebug("Breakpoint hit.\n");
+        }
+        else {
+            nvDebug("Got signal %d\n", sig);
+        }
+
+#if defined(NV_HAVE_EXECINFO_H)
+        if (hasStackTrace()) // in case of weak linking
+        {
+            void * trace[64];
+            int size = backtrace(trace, 64);
+
+            if (pnt != NULL) {
+                // Overwrite sigaction with caller's address.
+                trace[1] = pnt;
+            }
+
+            printStackTrace(trace, size, 1);
+        }
+#endif // defined(NV_HAVE_EXECINFO_H)
+        // The process crashed: report failure (same status as the assert handlers) instead of exit(0).
+        exit(EXIT_FAILURE + 1); // NOTE(review): exit() is not async-signal-safe; _exit() would be stricter.
+    }
+
+#endif // defined(NV_HAVE_SIGNAL_H)
+
+
+
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+
+    /** Win32 assert handler: logs the failure, dumps the stack and, in interactive mode, asks the user through a message box. */
+    struct Win32AssertHandler : public AssertHandler 
+    {
+        // Flush the message queue. This is necessary for the message box to show up.
+        static void flushMessageQueue()
+        {
+            MSG msg;
+            while( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) ) {
+                //if( msg.message == WM_QUIT ) break;
+                TranslateMessage( &msg );
+                DispatchMessage( &msg );
+            }
+        }
+
+        // Assert handler method. Returns NV_ABORT_DEBUG or NV_ABORT_IGNORE; NV_ABORT_EXIT terminates the process instead.
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
+        {
+            int ret = NV_ABORT_EXIT;
+
+            StringBuilder error_string;
+            error_string.format("*** Assertion failed: %s\n    On file: %s\n    On line: %d\n", exp, file, line );
+            if (func != NULL) {
+                error_string.appendFormat("    On function: %s\n", func);
+            }
+            if (msg != NULL) {
+                error_string.append("    Message: ");
+                va_list tmp;
+                va_copy(tmp, arg);
+                error_string.appendFormatList(msg, tmp);
+                va_end(tmp);
+                error_string.append("\n");
+            }
+            nvDebug( "%s", error_string.str() ); // Never use the text as the format: 'exp' or the message may contain '%'.
+
+            // Print stack trace:
+            debug::dumpInfo();
+
+            if (debug::isDebuggerPresent()) {
+                return NV_ABORT_DEBUG;
+            }
+
+            if (s_interactive) {
+                flushMessageQueue();
+                int action = MessageBoxA(NULL, error_string.str(), "Assertion failed", MB_ABORTRETRYIGNORE | MB_ICONERROR | MB_TOPMOST);
+                switch( action ) {
+                case IDRETRY:
+                    ret = NV_ABORT_DEBUG;
+                    break;
+                case IDIGNORE:
+                    ret = NV_ABORT_IGNORE;
+                    break;
+                case IDABORT:
+                default:
+                    ret = NV_ABORT_EXIT;
+                    break;
+                }
+                /*if( _CrtDbgReport( _CRT_ASSERT, file, line, module, exp ) == 1 ) {
+                    return NV_ABORT_DEBUG;
+                }*/
+            }
+
+            if (ret == NV_ABORT_EXIT) {
+                // Exit cleanly.
+                exit(EXIT_FAILURE + 1);
+            }
+
+            return ret;
+        }
+    };
+#elif NV_OS_XBOX
+
+    /** Xbox360 assert handler: logs the failure, breaks into an attached debugger, otherwise exits. 'msg' and 'arg' are not used. */
+    struct Xbox360AssertHandler : public AssertHandler 
+    {
+        // Assert handler method.
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
+        {
+            int ret = NV_ABORT_EXIT;
+
+            StringBuilder error_string;
+            if( func != NULL ) {
+                error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+                nvDebug( "%s", error_string.str() ); // Print verbatim: 'exp' may contain '%'.
+            }
+            else {
+                error_string.format( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+                nvDebug( "%s", error_string.str() ); // Print verbatim: 'exp' may contain '%'.
+            }
+
+            if (debug::isDebuggerPresent()) {
+                return NV_ABORT_DEBUG;
+            }
+
+            if( ret == NV_ABORT_EXIT ) {
+                 // Exit cleanly.
+                exit(EXIT_FAILURE + 1);
+            }
+
+            return ret;
+        }
+    };
+#elif NV_OS_ORBIS || NV_OS_DURANGO
+
+    /** Console (Orbis/Durango) assert handler: logs the failed assertion and never exits the process. */
+    struct ConsoleAssertHandler : public AssertHandler
+    {
+        // Assert handler method. 'msg' and 'arg' are not used; returns NV_ABORT_DEBUG under a debugger, else NV_ABORT_IGNORE.
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
+        {
+            if( func != NULL ) {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+            }
+            else {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+            }
+
+            //SBtodoORBIS print stack trace
+            /*if (hasStackTrace())
+            {
+                void * trace[64];
+                int size = backtrace(trace, 64);
+                printStackTrace(trace, size, 2);
+            }*/
+            
+            if (debug::isDebuggerPresent())
+                return NV_ABORT_DEBUG;
+
+            return NV_ABORT_IGNORE;
+        }
+    };
+
+#else
+
+    /** Unix assert handler: logs the failed assertion, prints a stack trace when available and terminates the process. */
+    struct UnixAssertHandler : public AssertHandler
+    {
+        // Assert handler method. Note that 'msg' and 'arg' are not used by this handler.
+        virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg)
+        {
+            // Report where the assertion failed; the function name is optional.
+            if( func == NULL ) {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On line: %d\n ", exp, file, line );
+            }
+            else {
+                nvDebug( "*** Assertion failed: %s\n    On file: %s\n    On function: %s\n    On line: %d\n ", exp, file, func, line );
+            }
+
+#if _DEBUG
+            // Debug builds hand control to an attached debugger instead of exiting.
+            if (debug::isDebuggerPresent()) {
+                return NV_ABORT_DEBUG;
+            }
+#endif
+
+#if defined(NV_HAVE_EXECINFO_H)
+            if (hasStackTrace())
+            {
+                const int kMaxFrames = 64;
+                void * frames[kMaxFrames];
+                const int frameCount = backtrace(frames, kMaxFrames);
+                printStackTrace(frames, frameCount, 2);
+            }
+#endif
+
+            // This handler has no ignore/retry path: exit cleanly with the usual failure status.
+            exit(EXIT_FAILURE + 1);
+
+            // Not reached; kept so the function still ends with an explicit return value.
+            return NV_ABORT_EXIT;
+        }
+    };
+
+#endif
+
+} // namespace
+
+
+/// Handle assertion through the installed assert handler, or the platform default when none is set; returns the handler's result.
+int nvAbort(const char * exp, const char * file, int line, const char * func/*=NULL*/, const char * msg/*= NULL*/, ...)
+{
+#if NV_OS_WIN32 //&& NV_CC_MSVC
+    static Win32AssertHandler s_default_assert_handler;
+#elif NV_OS_XBOX
+    static Xbox360AssertHandler s_default_assert_handler;
+#elif NV_OS_ORBIS || NV_OS_DURANGO
+    static ConsoleAssertHandler s_default_assert_handler;
+#else
+    static UnixAssertHandler s_default_assert_handler;
+#endif
+
+    va_list arg;
+    va_start(arg,msg); // The variadic arguments are the printf-style values for 'msg'.
+
+    AssertHandler * handler = s_assert_handler != NULL ? s_assert_handler : &s_default_assert_handler;
+    int result = handler->assertion(exp, file, line, func, msg, arg);
+
+    va_end(arg); // Not reached when the handler exits the process.
+
+    return result;
+}
+
+// Abnormal termination. On Win32/Durango writes a mini dump (plus a crash.txt call stack on Win32), then exits with 'code'.
+void debug::terminate(int code)
+{
+#if NV_OS_WIN32 || NV_OS_DURANGO
+    EnterCriticalSection(&s_handler_critical_section); // NOTE(review): initialized in enableSigHandler/initHandlerThread — confirm callers enable the handler first.
+
+    writeMiniDump(NULL);
+
+#if NV_OS_WIN32
+    const int max_stack_size = 64;
+    void * trace[max_stack_size];
+    int size = backtrace(trace, max_stack_size);
+
+    // @@ Use win32's CreateFile?
+    FILE * fp = fileOpen("crash.txt", "wb");
+    if (fp != NULL) {
+        Array<const char *> lines;
+        writeStackTrace(trace, size, 0, lines);
+
+        for (uint i = 0; i < lines.count(); i++) {
+            fputs(lines[i], fp);
+            delete lines[i]; // NOTE(review): assumes writeStackTrace allocates each line with scalar new — confirm.
+        }
+
+        // @@ Add more info to crash.txt?
+
+        fclose(fp);
+    }
+#endif
+
+    LeaveCriticalSection(&s_handler_critical_section);
+#endif
+
+    exit(code);
+}
+
+
+/// Formats a printf-style message and sends it to the installed message handler, or to stdout when none is set.
+void NV_CDECL nvDebugPrint(const char *msg, ...)
+{
+    va_list arg;
+    va_start(arg,msg);
+    if (s_message_handler == NULL) {
+        vprintf(msg, arg);
+    }
+    else {
+        s_message_handler->log( msg, arg );
+    }
+    va_end(arg);
+}
+
+
+/// Dump debug info: prints the current stack trace through nvDebug; compiles to a no-op where the #if below is false.
+void debug::dumpInfo()
+{
+#if (NV_OS_WIN32 && NV_CC_MSVC) || (defined(NV_HAVE_SIGNAL_H) && defined(NV_HAVE_EXECINFO_H))
+    if (hasStackTrace())
+    {
+        void * trace[64];
+        int size = backtrace(trace, 64);
+
+        nvDebug( "\nDumping stacktrace:\n" );
+
+        Array<const char *> lines;
+        writeStackTrace(trace, size, 1, lines); // Start at frame 1.
+
+        for (uint i = 0; i < lines.count(); i++) {
+            nvDebug("%s", lines[i]);
+            delete lines[i]; // NOTE(review): assumes writeStackTrace allocates each line with scalar new — confirm.
+        }
+    }
+#endif
+}
+
+/// Dump callstack using the specified handler, omitting the first 'callstackLevelsToSkip' frames above this call. 'messageHandler' is dereferenced unchecked.
+void debug::dumpCallstack(MessageHandler *messageHandler, int callstackLevelsToSkip /*= 0*/)
+{
+#if (NV_OS_WIN32 && NV_CC_MSVC) || (defined(NV_HAVE_SIGNAL_H) && defined(NV_HAVE_EXECINFO_H))
+    if (hasStackTrace())
+    {
+        void * trace[64];
+        int size = backtrace(trace, 64);
+
+        Array<const char *> lines;
+        writeStackTrace(trace, size, callstackLevelsToSkip + 1, lines);     // + 1 to skip the call to dumpCallstack
+
+        for (uint i = 0; i < lines.count(); i++) {
+            messageHandler->log(lines[i], NULL); // NOTE(review): the line is passed as the format with a NULL va_list — presumably lines never contain '%'; confirm.
+            delete lines[i];
+        }
+    }
+#endif
+}
+
+
+/// Set the debug message handler.
+void debug::setMessageHandler(MessageHandler * message_handler)
+{
+    s_message_handler = message_handler;
+}
+
+/// Reset the debug message handler.
+void debug::resetMessageHandler()
+{
+    s_message_handler = NULL;
+}
+
+/// Set the assert handler.
+void debug::setAssertHandler(AssertHandler * assert_handler)
+{
+    s_assert_handler = assert_handler;
+}
+
+/// Reset the assert handler.
+void debug::resetAssertHandler()
+{
+    s_assert_handler = NULL;
+}
+
+#if NV_OS_WIN32 || NV_OS_DURANGO
+#if NV_USE_SEPARATE_THREAD
+
+static void initHandlerThread()
+{
+    static const int kExceptionHandlerThreadInitialStackSize = 64 * 1024;
+
+    // Set synchronization primitives and the handler thread.  Each
+    // ExceptionHandler object gets its own handler thread because that's the
+    // only way to reliably guarantee sufficient stack space in an exception,
+    // and it allows an easy way to get a snapshot of the requesting thread's
+    // context outside of an exception.
+    InitializeCriticalSection(&s_handler_critical_section);
+    
+    s_handler_start_semaphore = CreateSemaphoreExW(NULL, 0, 1, NULL, 0,
+        SEMAPHORE_MODIFY_STATE | DELETE | SYNCHRONIZE);
+    nvDebugCheck(s_handler_start_semaphore != NULL);
+
+    s_handler_finish_semaphore = CreateSemaphoreExW(NULL, 0, 1, NULL, 0,
+        SEMAPHORE_MODIFY_STATE | DELETE | SYNCHRONIZE);
+    nvDebugCheck(s_handler_finish_semaphore != NULL);
+
+    // Don't attempt to create the thread if we could not create the semaphores.
+    if (s_handler_finish_semaphore != NULL && s_handler_start_semaphore != NULL) {
+        DWORD thread_id;
+        s_handler_thread = CreateThread(NULL,         // lpThreadAttributes
+                                        kExceptionHandlerThreadInitialStackSize,
+                                        ExceptionHandlerThreadMain,
+                                        NULL,         // lpParameter
+                                        0,            // dwCreationFlags
+                                        &thread_id);
+        nvDebugCheck(s_handler_thread != NULL);
+    }
+
+    /* @@ We should avoid loading modules in the exception handler!
+    dbghelp_module_ = LoadLibrary(L"dbghelp.dll");
+    if (dbghelp_module_) {
+        minidump_write_dump_ = reinterpret_cast<MiniDumpWriteDump_type>(GetProcAddress(dbghelp_module_, "MiniDumpWriteDump"));
+    }
+    */
+}
+
+static void shutHandlerThread() {
+    // @@ Free stuff. Terminate thread.
+}
+
+#endif // NV_USE_SEPARATE_THREAD
+#endif // NV_OS_WIN32
+
+
+// Enable signal handler. Returns immediately when already enabled; 'interactive' is stored in s_interactive for the assert handler.
+void debug::enableSigHandler(bool interactive)
+{
+    if (s_sig_handler_enabled) return;
+
+    s_sig_handler_enabled = true;
+    s_interactive = interactive;
+
+#if (NV_OS_WIN32 && NV_CC_MSVC) || NV_OS_DURANGO
+    if (interactive) { // NOTE(review): presumably system/CRT dialogs are silenced because the assert handler shows its own box — confirm.
+#if NV_OS_WIN32
+        // Do not display message boxes on error.
+        // http://msdn.microsoft.com/en-us/library/windows/desktop/ms680621(v=vs.85).aspx
+        SetErrorMode(SEM_FAILCRITICALERRORS|SEM_NOGPFAULTERRORBOX|SEM_NOOPENFILEERRORBOX);
+#endif
+
+        // CRT reports errors to debug output only.
+        // http://msdn.microsoft.com/en-us/library/1y71x448(v=vs.80).aspx
+        _CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_DEBUG);
+        _CrtSetReportMode(_CRT_ERROR, _CRTDBG_MODE_DEBUG);
+        _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_DEBUG);
+    }
+
+
+#if NV_USE_SEPARATE_THREAD
+    initHandlerThread();
+#else
+    InitializeCriticalSection(&s_handler_critical_section);
+#endif
+
+    s_old_exception_filter = ::SetUnhandledExceptionFilter( handleException ); // Saved so disableSigHandler can restore it.
+
+#if _MSC_VER >= 1400  // MSVC 2005/8
+    _set_invalid_parameter_handler(handleInvalidParameter);
+#endif  // _MSC_VER >= 1400
+
+    _set_purecall_handler(handlePureVirtualCall);
+
+#if NV_OS_WIN32
+    // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces
+    SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_FAIL_CRITICAL_ERRORS|SYMOPT_LOAD_LINES|SYMOPT_UNDNAME);
+
+    if (!SymInitialize(GetCurrentProcess(), NULL, TRUE)) {
+        DWORD error = GetLastError();
+        nvDebug("SymInitialize returned error : %d\n", error);
+    }
+#endif
+
+#elif !NV_OS_WIN32 && defined(NV_HAVE_SIGNAL_H)
+
+    // Install our signal handler
+    struct sigaction sa;
+    sa.sa_sigaction = nvSigHandler;
+    sigemptyset (&sa.sa_mask);
+    sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
+
+    sigaction(SIGSEGV, &sa, &s_old_sigsegv); // Previous actions are saved so disableSigHandler can restore them.
+    sigaction(SIGTRAP, &sa, &s_old_sigtrap);
+    sigaction(SIGFPE, &sa, &s_old_sigfpe);
+    sigaction(SIGBUS, &sa, &s_old_sigbus);
+
+#endif
+}
+
+/// Disable signal handler: restores the exception filter / signal actions saved by enableSigHandler. Checks (nvCheck) that it was enabled.
+void debug::disableSigHandler()
+{
+    nvCheck(s_sig_handler_enabled == true);
+    s_sig_handler_enabled = false;
+
+#if (NV_OS_WIN32 && NV_CC_MSVC) || NV_OS_DURANGO
+
+    ::SetUnhandledExceptionFilter( s_old_exception_filter );
+    s_old_exception_filter = NULL;
+
+#if NV_OS_WIN32
+    SymCleanup(GetCurrentProcess());
+#endif
+
+#elif !NV_OS_WIN32 && defined(NV_HAVE_SIGNAL_H)
+
+    sigaction(SIGSEGV, &s_old_sigsegv, NULL); // Same four signals that enableSigHandler hooked.
+    sigaction(SIGTRAP, &s_old_sigtrap, NULL);
+    sigaction(SIGFPE, &s_old_sigfpe, NULL);
+    sigaction(SIGBUS, &s_old_sigbus, NULL);
+
+#endif
+}
+
+/// Returns true when a debugger appears to be attached to this process; the check is platform specific and the last fallback is only a heuristic.
+bool debug::isDebuggerPresent()
+{
+#if NV_OS_WIN32
+    HINSTANCE kernel32 = GetModuleHandleA("kernel32.dll");
+    if (kernel32) {
+        FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent"); // Looked up at run time instead of linking against it.
+        if (IsDebuggerPresent != NULL && IsDebuggerPresent()) {
+            return true;
+        }
+    }
+    return false;
+#elif NV_OS_XBOX
+#ifdef _DEBUG
+    return DmIsDebuggerPresent() == TRUE;
+#else
+    return false;
+#endif
+#elif NV_OS_ORBIS
+  #if PS4_FINAL_REQUIREMENTS
+    return false; 
+  #else
+    return sceDbgIsDebuggerAttached() == 1;
+  #endif
+#elif NV_OS_DURANGO
+  #if XB1_FINAL_REQUIREMENTS
+    return false;
+  #else
+    return IsDebuggerPresent() == TRUE;
+  #endif
+#elif NV_OS_DARWIN
+    int mib[4];
+    struct kinfo_proc info;
+    size_t size;
+    mib[0] = CTL_KERN;
+    mib[1] = KERN_PROC;
+    mib[2] = KERN_PROC_PID;
+    mib[3] = getpid();
+    size = sizeof(info);
+    info.kp_proc.p_flag = 0; // Pre-cleared so an unchecked sysctl failure reads as "not traced".
+    sysctl(mib,4,&info,&size,NULL,0);
+    return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED);
+#else
+    // if ppid != sid, some process spawned our app, probably a debugger. 
+    return getsid(getpid()) != getppid();
+#endif
+}
+/// Win32 only: launches VSJitDebugger for this process and polls for a debugger; returns false if the launch fails or exits non-zero, true otherwise.
+bool debug::attachToDebugger()
+{
+#if NV_OS_WIN32
+    if (isDebuggerPresent() == FALSE) {
+        Path process(1024);
+        process.copy("\""); // Opening quote of the command line; the system directory is written right after it.
+        GetSystemDirectoryA(process.str() + 1, 1024 - 1);
+
+        process.appendSeparator();
+
+        process.appendFormat("VSJitDebugger.exe\" -p %lu", ::GetCurrentProcessId());
+
+        STARTUPINFOA sSi;
+        memset(&sSi, 0, sizeof(sSi));
+
+        PROCESS_INFORMATION sPi;
+        memset(&sPi, 0, sizeof(sPi));
+        
+        BOOL b = CreateProcessA(NULL, process.str(), NULL, NULL, FALSE, 0, NULL, NULL, &sSi, &sPi);
+        if (b != FALSE) {
+            ::WaitForSingleObject(sPi.hProcess, INFINITE);
+            
+            DWORD dwExitCode;
+            ::GetExitCodeProcess(sPi.hProcess, &dwExitCode);
+            if (dwExitCode != 0) //if exit code is zero, a debugger was selected
+                b = FALSE;
+        }
+
+        if (sPi.hThread != NULL) ::CloseHandle(sPi.hThread);
+        if (sPi.hProcess != NULL) ::CloseHandle(sPi.hProcess);
+
+        if (b == FALSE)
+            return false;
+
+        for (int i = 0; i < 5*60; i++) { // Up to 300 polls, 200 ms apart; still returns true if none attached.
+            if (isDebuggerPresent())
+                break;
+            ::Sleep(200);
+        }
+    }
+#endif // NV_OS_WIN32
+
+    return true;
+}
diff --git a/thirdparty/thekla_atlas/nvcore/Debug.h b/thirdparty/thekla_atlas/nvcore/Debug.h
new file mode 100644
index 0000000000..f37a05c453
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Debug.h
@@ -0,0 +1,246 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+#if NV_OS_IOS //ACS: maybe we want this for OSX too?
+#   ifdef __APPLE__
+#       include <TargetConditionals.h>
+#       include <signal.h>
+#   endif
+#endif
+
+// Make sure we are using our assert.
+#undef assert
+
+#define NV_ABORT_DEBUG      1
+#define NV_ABORT_IGNORE     2
+#define NV_ABORT_EXIT       3
+
+#define nvNoAssert(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    (void)sizeof(exp); \
+    NV_MULTI_LINE_MACRO_END
+
+#if NV_NO_ASSERT
+
+#   define nvAssert(exp) nvNoAssert(exp)
+#   define nvCheck(exp) nvNoAssert(exp)
+#   define nvDebugAssert(exp) nvNoAssert(exp)
+#   define nvDebugCheck(exp) nvNoAssert(exp)
+#   define nvDebugBreak() nvNoAssert(0)
+
+#else // NV_NO_ASSERT
+
+#   if NV_CC_MSVC
+        // @@ Does this work in msvc-6 and earlier?
+#       define nvDebugBreak()       __debugbreak()
+//#       define nvDebugBreak()        __asm { int 3 }
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __debugbreak()
+#   elif NV_OS_IOS && TARGET_OS_IPHONE
+#       define nvDebugBreak()       raise(SIGINT)
+#   elif NV_CC_CLANG
+#       define nvDebugBreak()       __builtin_debugtrap()
+#   elif NV_CC_GNUC
+//#       define nvDebugBreak()       __builtin_debugtrap()     // Does GCC have debugtrap?
+#       define nvDebugBreak()		__builtin_trap()
+/*
+#   elif NV_CC_GNUC && NV_CPU_PPC && NV_OS_DARWIN
+// @@ Use __builtin_trap() on GCC
+#       define nvDebugBreak()       __asm__ volatile ("trap")
+#   elif NV_CC_GNUC && NV_CPU_X86 && NV_OS_DARWIN
+#       define nvDebugBreak()       __asm__ volatile ("int3")
+#   elif NV_CC_GNUC && NV_CPU_X86 
+#       define nvDebugBreak()       __asm__ ( "int %0" : :"I"(3) )
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __asm volatile ("int $0x41")
+#   else
+#       include <signal.h>
+#       define nvDebugBreak()       raise(SIGTRAP); 
+// define nvDebugBreak()        *((int *)(0)) = 0
+*/
+#   endif
+
+#  if NV_CC_MSVC
+#   define nvExpect(expr) (expr)
+#else
+#   define nvExpect(expr) __builtin_expect((expr) != 0, true)
+#endif
+
+#if NV_CC_CLANG 
+#   if __has_feature(attribute_analyzer_noreturn)
+#       define NV_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#   else
+#       define NV_ANALYZER_NORETURN
+#   endif
+#else
+#   define NV_ANALYZER_NORETURN
+#endif
+
+#define nvDebugBreakOnce() \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    static bool firstTime = true; \
+    if (firstTime) { firstTime = false; nvDebugBreak(); } \
+    NV_MULTI_LINE_MACRO_END
+
+#define nvAssertMacro(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    if (!nvExpect(exp)) { \
+        if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
+            nvDebugBreak(); \
+        } \
+    } \
+    NV_MULTI_LINE_MACRO_END
+
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+        static bool ignoreAll = false; \
+        if (!ignoreAll && !nvExpect(exp)) { \
+            int _result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
+            if (_result == NV_ABORT_DEBUG) { \
+                nvDebugBreak(); \
+            } else if (_result == NV_ABORT_IGNORE) { \
+                ignoreAll = true; \
+            } \
+        } \
+    NV_MULTI_LINE_MACRO_END
+
+// Interesting assert macro from Insomniac:
+// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
+// Used as follows:
+// if (nvCheck(i < count)) {
+//     normal path
+// } else {
+//     fixup code.
+// }
+// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
+#define nvCheckMacro(exp) \
+    (\
+        (exp) ? true : ( \
+            (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
+        ) \
+    )
+
+
+#define nvAssert(exp)    nvAssertMacro(exp)
+#define nvCheck(exp)     nvAssertMacro(exp)
+
+#if defined(_DEBUG)
+#   define nvDebugAssert(exp)   nvAssertMacro(exp)
+#   define nvDebugCheck(exp)    nvAssertMacro(exp)
+#else // _DEBUG
+#   define nvDebugAssert(exp)   nvNoAssert(exp)
+#   define nvDebugCheck(exp)    nvNoAssert(exp)
+#endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expressions only: nvAssume(0), nvAssume(value == true), etc.
+/*#if !defined(_DEBUG)
+#   if NV_CC_MSVC
+#       define nvAssume(exp)    __assume(exp)
+#   else
+#       define nvAssume(exp)    nvCheck(exp)
+#   endif
+#else
+#   define nvAssume(exp)    nvCheck(exp)
+#endif*/
+
+#if defined(_DEBUG) // Debug: assert, then hint. Wrapped so the macro is ONE statement and stays correct in an unbraced if/else.
+#  if NV_CC_MSVC
+#   define nvUnreachable() NV_MULTI_LINE_MACRO_BEGIN nvAssert(0 && "unreachable"); __assume(0); NV_MULTI_LINE_MACRO_END
+#  else
+#   define nvUnreachable() NV_MULTI_LINE_MACRO_BEGIN nvAssert(0 && "unreachable"); __builtin_unreachable(); NV_MULTI_LINE_MACRO_END
+#  endif
+#else // Release: only the optimizer hint; reaching it is undefined behaviour.
+#  if NV_CC_MSVC
+#   define nvUnreachable() __assume(0)
+#  else
+#   define nvUnreachable() __builtin_unreachable()
+#  endif
+#endif
+
+#define nvError(x)      nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)    nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+#ifndef NV_DEBUG_PRINT
+#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
+#endif
+
+#if NV_DEBUG_PRINT
+#define nvDebug(...)    nvDebugPrint(__VA_ARGS__)
+#else
+#if NV_CC_MSVC
+#define nvDebug(...)    __noop(__VA_ARGS__)
+#else
+#define nvDebug(...)    ((void)0) // Non-msvc platforms do not evaluate arguments?
+#endif
+#endif
+
+
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6))) NV_ANALYZER_NORETURN;
+NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+    inline bool isValidPtr(const void * ptr) { // Cheap plausibility test on the pointer value only; ptr is never dereferenced.
+    #if NV_OS_DARWIN
+        return true;    // IC: Not sure what ranges are OK on OSX.
+    #endif
+        
+    #if NV_CPU_X86_64
+        if (ptr == NULL) return true; // NULL is accepted as valid here.
+        if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
+        if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
+    #else // NOTE(review): casts the pointer to uint32 — presumably assumes 32-bit pointers on every non-x86_64 target; confirm.
+	    if (reinterpret_cast<uint32>(ptr) == 0xcccccccc) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xcdcdcdcd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xdddddddd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xffffffff) return false;
+    #endif
+        return true;
+    }
+
+    // Message handler interface.
+    struct MessageHandler {
+        virtual void log(const char * str, va_list arg) = 0;
+        virtual ~MessageHandler() {}
+    };
+
+    // Assert handler interface.
+    struct AssertHandler {
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
+        virtual ~AssertHandler() {}
+    };
+
+
+    namespace debug
+    {
+        NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
+
+        NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+        NVCORE_API void resetMessageHandler();
+
+        NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+        NVCORE_API void resetAssertHandler();
+
+        NVCORE_API void enableSigHandler(bool interactive);
+        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
+        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_DEBUG_H
diff --git a/thirdparty/thekla_atlas/nvcore/DefsGnucDarwin.h b/thirdparty/thekla_atlas/nvcore/DefsGnucDarwin.h
new file mode 100644
index 0000000000..afb21c3d25
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/DefsGnucDarwin.h
@@ -0,0 +1,57 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT __attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#if NV_OS_IOS
+#define NV_THREAD_LOCAL // @@ IC: Looks like iOS does not have support for TLS declarations.
+#else
+#define NV_THREAD_LOCAL __thread
+#endif
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__
diff --git a/thirdparty/thekla_atlas/nvcore/DefsGnucLinux.h b/thirdparty/thekla_atlas/nvcore/DefsGnucLinux.h
new file mode 100644
index 0000000000..2126d866f5
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/DefsGnucLinux.h
@@ -0,0 +1,59 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#   define DLL_EXPORT   __attribute__((visibility("default")))
+#   define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#   define DLL_EXPORT
+#   define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#   define NV_CDECL     __attribute__((cdecl))
+#   define NV_STDCALL   __attribute__((stdcall))
+#else
+#   define NV_CDECL 
+#   define NV_STDCALL
+#endif
+
+#define NV_FASTCALL     __attribute__((fastcall))
+//#if __GNUC__ > 3
+// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
+#define NV_FORCEINLINE  inline __attribute__((always_inline))
+//#else
+// Some compilers complain that inline and always_inline are redundant.
+//#define NV_FORCEINLINE  __attribute__((always_inline))
+//#endif
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread 
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#   if __GNUC__ >= 2
+#       define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
+#   else
+#       define __FUNC__ "<unknown>"
+#   endif
+#else
+#   define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__
diff --git a/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h b/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h
new file mode 100644
index 0000000000..e1c8d6e4f8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/DefsGnucWin32.h
@@ -0,0 +1,67 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+//#include <cstddef> // size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+// -- GODOT start -
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+// -- GODOT end -
+#define NV_DEPRECATED   __attribute__((deprecated))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+/*
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
+*/
+
diff --git a/thirdparty/thekla_atlas/nvcore/DefsVcWin32.h b/thirdparty/thekla_atlas/nvcore/DefsVcWin32.h
new file mode 100644
index 0000000000..a915f3791a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/DefsVcWin32.h
@@ -0,0 +1,94 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT __declspec(dllimport)
+#define DLL_EXPORT __declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL        __cdecl
+#define NV_STDCALL      __stdcall
+#define NV_FASTCALL     __fastcall
+#define NV_DEPRECATED
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names.
+#if _MSC_VER < 1900
+#	define snprintf _snprintf
+#endif
+#if _MSC_VER < 1500
+#   define vsnprintf _vsnprintf
+#endif
+#if _MSC_VER < 1700
+#   define strtoll _strtoi64
+#   define strtoull _strtoui64
+#endif
+//#define chdir _chdir
+#define getcwd _getcwd 
+
+#if _MSC_VER <= 1600
+#define va_copy(a, b) (a) = (b)
+#endif
+
+#if !defined restrict
+#define restrict
+#endif
+
+// Ignore gcc attributes.
+#define __attribute__(X)
+
+#if !defined __FUNC__
+#define __FUNC__ __FUNCTION__ 
+#endif
+
+#define NV_NOINLINE __declspec(noinline)
+#define NV_FORCEINLINE __forceinline
+
+#define NV_THREAD_LOCAL __declspec(thread)
+
+/*
+// Type definitions
+typedef unsigned char       uint8;
+typedef signed char         int8;
+
+typedef unsigned short      uint16;
+typedef signed short        int16;
+
+typedef unsigned int        uint32;
+typedef signed int          int32;
+
+typedef unsigned __int64    uint64;
+typedef signed __int64      int64;
+
+// Aliases
+typedef uint32              uint;
+*/
+
+// Unwanted VC++ warnings to disable.
+/*
+#pragma warning(disable : 4244)     // conversion to float, possible loss of data
+#pragma warning(disable : 4245)     // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)     // unreferenced formal parameter
+#pragma warning(disable : 4514)     // unreferenced inline function has been removed
+#pragma warning(disable : 4710)     // inline function not expanded
+#pragma warning(disable : 4127)     // Conditional expression is constant
+#pragma warning(disable : 4305)     // truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)     // unreferenced local function has been removed
+
+#pragma warning(disable : 4702)     // unreachable code in inline expanded function
+#pragma warning(disable : 4711)     // function selected for automatic inlining
+#pragma warning(disable : 4725)     // Pentium fdiv bug
+
+#pragma warning(disable : 4786)     // Identifier was truncated and cannot be debugged.
+
+#pragma warning(disable : 4675)     // resolved overload was found by argument-dependent lookup
+*/
+
+#pragma warning(1 : 4705)     // Report unused local variables.
+#pragma warning(1 : 4555)     // Expression has no effect.
diff --git a/thirdparty/thekla_atlas/nvcore/FileSystem.cpp b/thirdparty/thekla_atlas/nvcore/FileSystem.cpp
new file mode 100644
index 0000000000..5ed0ca074f
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/FileSystem.cpp
@@ -0,0 +1,75 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "FileSystem.h"
+
+#if NV_OS_WIN32
+#define _CRT_NONSTDC_NO_WARNINGS // _chdir is defined deprecated, but that's a bug, chdir is deprecated, _chdir is *not*.
+//#include <shlwapi.h> // PathFileExists
+#include <windows.h> // GetFileAttributes
+#include <direct.h> // _mkdir
+#elif NV_OS_XBOX
+#include <Xtl.h>
+#elif NV_OS_DURANGO
+#include <Windows.h>
+#else
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#endif
+#include <stdio.h> // remove, unlink
+
+using namespace nv;
+
+
+bool FileSystem::exists(const char * path) // Returns true if 'path' can be found; the exact test depends on the platform branch below.
+{
+#if NV_OS_UNIX
+	return access(path, F_OK|R_OK) == 0; // R_OK is requested too: the path must be readable, not merely present.
+	//struct stat buf;
+	//return stat(path, &buf) == 0;
+#elif NV_OS_WIN32 || NV_OS_XBOX || NV_OS_DURANGO
+    // PathFileExists requires linking to shlwapi.lib
+    //return PathFileExists(path) != 0;
+    return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES;
+#else
+	if (FILE * fp = fopen(path, "r")) // Fallback: probe by opening the path for reading.
+	{
+		fclose(fp);
+		return true;
+	}
+	return false;
+#endif
+}
+
+bool FileSystem::createDirectory(const char * path)
+{
+#if NV_OS_WIN32 || NV_OS_XBOX || NV_OS_DURANGO
+    return CreateDirectoryA(path, NULL) != 0;
+#elif NV_OS_ORBIS
+    // not implemented
+	return false;
+#else
+    return mkdir(path, 0777) != -1;
+#endif
+}
+
+bool FileSystem::changeDirectory(const char * path)
+{
+#if NV_OS_WIN32
+    return _chdir(path) != -1;
+#elif NV_OS_XBOX || NV_OS_DURANGO
+	// Xbox doesn't support Current Working Directory!
+	return false;
+#elif NV_OS_ORBIS
+    // Orbis doesn't support Current Working Directory!
+	return false;
+#else
+    return chdir(path) != -1;
+#endif
+}
+
+bool FileSystem::removeFile(const char * path)
+{
+    // @@ Use unlink or remove?
+    return remove(path) == 0;
+}
diff --git a/thirdparty/thekla_atlas/nvcore/FileSystem.h b/thirdparty/thekla_atlas/nvcore/FileSystem.h
new file mode 100644
index 0000000000..afd0f449d3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/FileSystem.h
@@ -0,0 +1,24 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_CORE_FILESYSTEM_H
+#define NV_CORE_FILESYSTEM_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+
+    namespace FileSystem
+    {
+        NVCORE_API bool exists(const char * path);
+        NVCORE_API bool createDirectory(const char * path);
+        NVCORE_API bool changeDirectory(const char * path);
+        NVCORE_API bool removeFile(const char * path);
+
+    } // FileSystem namespace
+
+} // nv namespace
+
+
+#endif // NV_CORE_FILESYSTEM_H
diff --git a/thirdparty/thekla_atlas/nvcore/ForEach.h b/thirdparty/thekla_atlas/nvcore/ForEach.h
new file mode 100644
index 0000000000..bc66f424ef
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/ForEach.h
@@ -0,0 +1,71 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_FOREACH_H
+#define NV_CORE_FOREACH_H
+
+/*
+These foreach macros are very non-standard and somewhat confusing, but I like them.
+*/
+
+#include "nvcore.h"
+
+
+#if NV_CC_CPP11
+
+#define NV_FOREACH(i, container) \
+    for (auto i = (container).start(); !(container).isDone(i); (container).advance(i))
+
+#elif NV_CC_GNUC // If typeof is available:
+
+/*
+Ideally we would like to write this:
+
+#define NV_FOREACH(i, container) \
+    for(decltype(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+But gcc versions prior to 4.7 required an intermediate type. See:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
+*/
+
+#define NV_FOREACH(i, container) \
+    typedef typeof(container) NV_STRING_JOIN2(cont,__LINE__); \
+    for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+#else // If typeof not available:
+
+#define NV_NEED_PSEUDOINDEX_WRAPPER 1
+
+#include <new> // placement new
+
+struct PseudoIndexWrapper {
+    template <typename T>
+    PseudoIndexWrapper(const T & container) {
+        nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
+        new (memory) typename T::PseudoIndex(container.start());
+    }
+    // PseudoIndex cannot have a dtor!
+
+    template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
+        return *reinterpret_cast<typename T::PseudoIndex *>(memory);
+    }
+    template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
+        return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
+    }
+
+    uint8 memory[4];	// Increase the size if we have bigger enumerators.
+};
+
+#define NV_FOREACH(i, container) \
+    for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
+
+#endif
+
+// Declare foreach keyword.
+#if !defined NV_NO_USE_KEYWORDS
+#   define foreach NV_FOREACH
+#   define foreach_index NV_FOREACH
+#endif
+
+
+#endif // NV_CORE_FOREACH_H
diff --git a/thirdparty/thekla_atlas/nvcore/Hash.h b/thirdparty/thekla_atlas/nvcore/Hash.h
new file mode 100644
index 0000000000..a8b0b2c63b
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Hash.h
@@ -0,0 +1,83 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_HASH_H
+#define NV_CORE_HASH_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+    inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
+    {
+        const uint8 * data = (const uint8 *) data_in;
+        uint i = 0;
+        while (i < size) {
+            h = (h << 16) + (h << 6) - h + (uint) data[i++];
+        }
+        return h;
+    }
+
+    // Hashes 'count' floats by their 32-bit patterns, chained from seed h. Note that this hash does not handle NaN properly.
+    inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
+    {
+        for (uint i = 0; i < count; i++) {
+            //nvDebugCheck(nv::isFinite(*f));
+            union { float f; uint32 i; } x = { f[i] };
+            if (x.i == 0x80000000) x.i = 0; // Sign bit only (-0.0f under IEEE-754): fold onto +0.0f so both zeros hash alike.
+            h = sdbmHash(&x, 4, h);
+        }
+        return h;
+    }
+
+
+    template <typename T>
+    inline uint hash(const T & t, uint h = 5381)
+    {
+        return sdbmHash(&t, sizeof(T), h);
+    }
+
+    template <>
+    inline uint hash(const float & f, uint h)
+    {
+        return sdbmFloatHash(&f, 1, h);
+    }
+
+
+    // Functors for hash table:
+    template <typename Key> struct Hash 
+    {
+        uint operator()(const Key & k) const {
+            return hash(k);
+        }
+    };
+
+    template <typename Key> struct Equal
+    {
+        bool operator()(const Key & k0, const Key & k1) const {
+            return k0 == k1;
+        }
+    };
+
+
+    // @@ Move to Utils.h?
+    template <typename T1, typename T2>
+    struct Pair {
+        T1 first;
+        T2 second;
+    };
+
+    template <typename T1, typename T2>
+    bool operator==(const Pair<T1,T2> & p0, const Pair<T1,T2> & p1) {
+        return p0.first == p1.first && p0.second == p1.second;
+    }
+
+    template <typename T1, typename T2> // Hash a Pair by chaining: seed h -> first -> second.
+    uint hash(const Pair<T1,T2> & p, uint h = 5381) {
+        return hash(p.second, hash(p.first, h)); // Thread the caller's seed through; it used to be silently ignored.
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_HASH_H
diff --git a/thirdparty/thekla_atlas/nvcore/HashMap.h b/thirdparty/thekla_atlas/nvcore/HashMap.h
new file mode 100644
index 0000000000..7856d6a8c9
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/HashMap.h
@@ -0,0 +1,174 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_HASHMAP_H
+#define NV_CORE_HASHMAP_H
+
+/*
+HashMap based on Thatcher Ulrich <tu@tulrich.com> container, donated to the Public Domain.
+
+I'd like to do something to reduce the amount of code generated with this template. The type of 
+U is largely irrelevant to the generated code, except for calls to constructors and destructors,
+but the combination of all T and U pairs generates a large amount of code.
+
+HashMap is not used in NVTT, so it could be removed from the repository.
+*/
+
+
+#include "Memory.h"
+#include "Debug.h"
+#include "ForEach.h"
+#include "Hash.h"
+
+namespace nv 
+{
+    class Stream;
+
+    /** Thatcher Ulrich's hash table.
+    *
+    * Hash table, linear probing, internal chaining.  One
+    * interesting/nice thing about this implementation is that the table
+    * itself is a flat chunk of memory containing no pointers, only
+    * relative indices.  If the key and value types of the hash contain
+    * no pointers, then the hash can be serialized using raw IO.  Could
+    * come in handy.
+    *
+    * Never shrinks, unless you explicitly clear() it.  Expands on
+    * demand, though.  For best results, if you know roughly how big your
+    * table will be, default it to that size when you create it.
+    */
+    template<typename T, typename U, typename H = Hash<T>, typename E = Equal<T> >
+    class NVCORE_CLASS HashMap
+    {
+        NV_FORBID_COPY(HashMap);
+    public:
+
+        /// Default ctor.
+        HashMap() : entry_count(0), size_mask(-1), table(NULL) { }
+
+        /// Ctor with size hint.
+        explicit HashMap(int size_hint) : entry_count(0), size_mask(-1), table(NULL) { setCapacity(size_hint); }
+
+        /// Dtor.
+        ~HashMap() { clear(); }
+
+
+        void set(const T& key, const U& value);
+        void add(const T& key, const U& value);
+        bool remove(const T& key);
+        void clear();
+        bool isEmpty() const;
+        bool get(const T& key, U* value = NULL, T* other_key = NULL) const;
+        bool contains(const T & key) const;
+        int	size() const;
+        int	count() const;
+        int	capacity() const;
+        void checkExpand();
+        void resize(int n);
+
+        void setCapacity(int new_size);
+
+        // Behaves much like std::pair.
+        struct Entry
+        {
+            int	next_in_chain;	// internal chaining for collisions
+            uint hash_value;	// avoids recomputing.  Worthwhile?
+            T key;
+            U value;
+
+            Entry() : next_in_chain(-2) {}
+            Entry(const Entry& e) : next_in_chain(e.next_in_chain), hash_value(e.hash_value), key(e.key), value(e.value) {}
+            Entry(const T& k, const U& v, int next, int hash) : next_in_chain(next), hash_value(hash), key(k), value(v) {}
+            
+            bool isEmpty() const { return next_in_chain == -2; }
+            bool isEndOfChain() const { return next_in_chain == -1; }
+            bool isTombstone() const { return hash_value == TOMBSTONE_HASH; }
+
+            void clear() {
+                key.~T();	// placement delete
+                value.~U();	// placement delete
+                next_in_chain = -2;
+                hash_value = ~TOMBSTONE_HASH;
+            }
+
+            void makeTombstone() {
+                key.~T();
+                value.~U();
+                hash_value = TOMBSTONE_HASH;
+            }
+        };
+
+
+        // HashMap enumerator.
+        typedef int PseudoIndex;
+        PseudoIndex start() const { PseudoIndex i = 0; findNext(i); return i; }
+        bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= size_mask+1); return i == size_mask+1; };
+        void advance(PseudoIndex & i) const { nvDebugCheck(i <= size_mask+1); i++; findNext(i); }
+
+#if NV_NEED_PSEUDOINDEX_WRAPPER
+        Entry & operator[]( const PseudoIndexWrapper & i ) {
+            Entry & e = entry(i(this));
+            nvDebugCheck(e.isTombstone() == false);
+            return e;
+        }
+        const Entry & operator[]( const PseudoIndexWrapper & i ) const {
+            const Entry & e = entry(i(this));
+            nvDebugCheck(e.isTombstone() == false);
+            return e;
+        }
+#else
+        Entry & operator[](const PseudoIndex & i) {
+            Entry & e = entry(i);
+            nvDebugCheck(e.isTombstone() == false);
+            return e;
+        }
+        const Entry & operator[](const PseudoIndex & i) const {
+            const Entry & e = entry(i);
+            nvDebugCheck(e.isTombstone() == false);
+            return e;
+        }
+#endif
+
+
+        // By default we serialize the key-value pairs compactly.
+        template<typename _T, typename _U, typename _H, typename _E>
+        friend Stream & operator<< (Stream & s, HashMap<_T, _U, _H, _E> & map);
+
+        // This requires more storage, but saves us from rehashing the elements.
+        template<typename _T, typename _U, typename _H, typename _E>
+        friend Stream & rawSerialize(Stream & s, HashMap<_T, _U, _H, _E> & map);
+
+        /// Swap the members of the two given hash maps.
+        template<typename _T, typename _U, typename _H, typename _E>
+        friend void swap(HashMap<_T, _U, _H, _E> & a, HashMap<_T, _U, _H, _E> & b);
+	
+    private:
+        static const uint TOMBSTONE_HASH = (uint) -1;
+
+        uint compute_hash(const T& key) const;
+
+        // Find the index of the matching entry. If no match, then return -1.
+        int	findIndex(const T& key) const;
+
+        // Return the index of the newly cleared element.
+        int removeTombstone(int index);
+
+        // Helpers.
+        Entry & entry(int index);
+        const Entry & entry(int index) const;
+
+        void setRawCapacity(int new_size);
+
+        // Move the enumerator to the next valid element.
+        void findNext(PseudoIndex & i) const;
+
+
+        int	entry_count;
+        int	size_mask;
+        Entry * table;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_HASHMAP_H
diff --git a/thirdparty/thekla_atlas/nvcore/HashMap.inl b/thirdparty/thekla_atlas/nvcore/HashMap.inl
new file mode 100644
index 0000000000..f0b6bfea62
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/HashMap.inl
@@ -0,0 +1,550 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_HASHMAP_INL
+#define NV_CORE_HASHMAP_INL
+
+#include "HashMap.h"
+
+#include "Stream.h"
+#include "Utils.h" // swap
+
+#include <new> // for placement new
+
+
+namespace nv 
+{
+
+    // Set a new or existing value under the key, to the value.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::set(const T& key, const U& value)
+    {
+        int	index = findIndex(key);
+        if (index >= 0)
+        {
+            entry(index).value = value;
+            return;
+        }
+
+        // Entry under key doesn't exist.
+        add(key, value);
+    }
+
+
+    // Add a new value to the hash table, under the specified key.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::add(const T& key, const U& value)
+    {
+        nvCheck(findIndex(key) == -1);
+
+        checkExpand();
+        nvCheck(table != NULL);
+        entry_count++;
+
+        const uint hash_value = compute_hash(key);
+        const int index = hash_value & size_mask;
+
+        Entry * natural_entry = &(entry(index));
+
+        if (natural_entry->isEmpty())
+        {
+            // Put the new entry in.
+            new (natural_entry) Entry(key, value, -1, hash_value);
+        } 
+        else if (natural_entry->isTombstone()) {
+            // Put the new entry in, without disturbing the rest of the chain.
+            int next_in_chain = natural_entry->next_in_chain;
+            new (natural_entry) Entry(key, value, next_in_chain, hash_value);
+        }
+        else
+        {
+            // Find a blank spot.
+            int	blank_index = index;
+            for (int search_count = 0; ; search_count++)
+            {
+                blank_index = (blank_index + 1) & size_mask;
+                if (entry(blank_index).isEmpty()) break;	// found it
+                if (entry(blank_index).isTombstone()) {
+                    blank_index = removeTombstone(blank_index);
+                    break;
+                }
+                nvCheck(search_count < this->size_mask);
+            }
+            Entry * blank_entry = &entry(blank_index);
+
+            if (int(natural_entry->hash_value & size_mask) == index)
+            {
+                // Collision.  Link into this chain.
+
+                // Move existing list head.
+                new (blank_entry) Entry(*natural_entry);	// placement new, copy ctor
+
+                // Put the new info in the natural entry.
+                natural_entry->key = key;
+                natural_entry->value = value;
+                natural_entry->next_in_chain = blank_index;
+                natural_entry->hash_value = hash_value;
+            }
+            else
+            {
+                // Existing entry does not naturally
+                // belong in this slot.  Existing
+                // entry must be moved.
+
+                // Find natural location of collided element (i.e. root of chain)
+                int	collided_index = natural_entry->hash_value & size_mask;
+                for (int search_count = 0; ; search_count++)
+                {
+                    Entry * e = &entry(collided_index);
+                    if (e->next_in_chain == index)
+                    {
+                        // Here's where we need to splice.
+                        new (blank_entry) Entry(*natural_entry);
+                        e->next_in_chain = blank_index;
+                        break;
+                    }
+                    collided_index = e->next_in_chain;
+                    nvCheck(collided_index >= 0 && collided_index <= size_mask);
+                    nvCheck(search_count <= size_mask);
+                }
+
+                // Put the new data in the natural entry.
+                natural_entry->key = key;
+                natural_entry->value = value;
+                natural_entry->hash_value = hash_value;
+                natural_entry->next_in_chain = -1;
+            }
+        }
+    }
+
+
+    // Remove the entry stored under the specified key. Returns true if one was removed, false if the key was not found.
+    template<typename T, typename U, typename H, typename E>
+    bool HashMap<T, U, H, E>::remove(const T& key)
+    {
+        if (table == NULL)
+        {
+            return false;
+        }
+
+        int	index = findIndex(key);
+        if (index < 0)
+        {
+            return false;
+        }
+
+        Entry * pos = &entry(index);
+
+        int natural_index = (int) (pos->hash_value & size_mask);
+
+        if (index != natural_index) {
+            // We're not the head of our chain, so we can
+            // be spliced out of it.
+
+            // Iterate up the chain from its head, and splice out when
+            // we reach the entry that links to index.
+            Entry* e = &entry(natural_index);
+            while (e->next_in_chain != index) {
+                nvDebugCheck(e->isEndOfChain() == false);
+                e = &entry(e->next_in_chain);
+            }
+
+            if (e->isTombstone() && pos->isEndOfChain()) {
+                // Tombstone has nothing else to point
+                // to, so mark it empty.
+                e->next_in_chain = -2;
+            } else {
+                e->next_in_chain = pos->next_in_chain;
+            }
+
+            pos->clear();
+        }
+        else if (pos->isEndOfChain() == false) {
+            // We're the head of our chain, and there are
+            // additional elements.
+            //
+            // We need to put a tombstone here.
+            //
+            // We can't clear the element, because the
+            // rest of the elements in the chain must be
+            // linked to this position.
+            //
+            // We can't move any of the succeeding
+            // elements in the chain (i.e. to fill this
+            // entry), because we don't want to invalidate
+            // any other existing iterators.
+            pos->makeTombstone();
+        } else {
+            // We're the head of the chain, but we're the
+            // only member of the chain.
+            pos->clear();
+        }
+
+        entry_count--;
+
+        return true;
+    }
+
+
+    // Remove all entries from the hash table.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::clear()
+    {
+        if (table != NULL)
+        {
+            // Delete the entries.
+            for (int i = 0, n = size_mask; i <= n; i++)
+            {
+                Entry * e = &entry(i);
+                if (e->isEmpty() == false && e->isTombstone() == false)
+                {
+                    e->clear();
+                }
+            }
+            free(table);
+            table = NULL;
+            entry_count = 0;
+            size_mask = -1;
+        }
+    }
+
+
+    // Returns true if the hash is empty.
+    template<typename T, typename U, typename H, typename E>
+    bool HashMap<T, U, H, E>::isEmpty() const
+    {
+        return table == NULL || entry_count == 0;
+    }
+
+
+    // Retrieve the value under the given key.
+    // - If there's no value under the key, then return false and leave *value alone.
+    // - If there is a value, return true, and set *value to the entry's value.
+    // - If value == NULL, return true or false according to the presence of the key, but don't touch *value.
+    template<typename T, typename U, typename H, typename E>
+    bool HashMap<T, U, H, E>::get(const T& key, U* value/*= NULL*/, T* other_key/*= NULL*/) const
+    {
+        int	index = findIndex(key);
+        if (index >= 0)
+        {
+            if (value != NULL) {
+                *value = entry(index).value;	// take care with side-effects!
+            }
+            if (other_key != NULL) {
+                *other_key = entry(index).key;
+            }
+            return true;
+        }
+        return false;
+    }
+
+    // Determine if the given key is contained in the hash.
+    template<typename T, typename U, typename H, typename E>
+    bool HashMap<T, U, H, E>::contains(const T & key) const
+    {
+        return get(key);
+    }
+
+    // Number of entries in the hash.
+    template<typename T, typename U, typename H, typename E>
+    int	HashMap<T, U, H, E>::size() const
+    {
+        return entry_count;
+    }
+
+    // Number of entries in the hash.
+    template<typename T, typename U, typename H, typename E>
+    int	HashMap<T, U, H, E>::count() const
+    {
+        return size();
+    }
+
+    template<typename T, typename U, typename H, typename E>
+    int	HashMap<T, U, H, E>::capacity() const
+    {
+        return size_mask+1;
+    }
+
+
+    // Resize the hash table to fit one more entry.  Often this doesn't involve any action.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::checkExpand()
+    {
+        if (table == NULL) {
+            // Initial creation of table.  Make a minimum-sized table.
+            setRawCapacity(16);
+        } 
+        else if (entry_count * 3 > (size_mask + 1) * 2) {
+            // Table is more than 2/3rds full.  Expand.
+            setRawCapacity(entry_count * 2);
+        }
+    }
+
+
+    // Hint the bucket count to >= n.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::resize(int n)
+    {
+        // Not really sure what this means in relation to
+        // STLport's hash_map... they say they "increase the
+        // bucket count to at least n" -- but does that mean
+        // their real capacity after resize(n) is more like
+        // n*2 (since they do linked-list chaining within
+        // buckets?).
+        setCapacity(n);
+    }
+
+
+    // Size the hash so that it can comfortably contain the given number of elements.  If the hash already contains more
+    // elements than new_size, then this may be a no-op.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::setCapacity(int new_size)
+    {
+        int	new_raw_size = (new_size * 3) / 2;
+        if (new_raw_size < size()) { return; }
+
+        setRawCapacity(new_raw_size);
+    }
+
+
+    // By default we serialize the key-value pairs compactly.
+    template<typename _T, typename _U, typename _H, typename _E>
+    Stream & operator<< (Stream & s, HashMap<_T, _U, _H, _E> & map)
+    {
+        typedef typename HashMap<_T, _U, _H, _E>::Entry HashMapEntry;
+
+        int entry_count = map.entry_count;
+        s << entry_count;
+
+        if (s.isLoading()) {
+            map.clear();
+            if(entry_count == 0) {
+                return s;
+            }
+            map.entry_count = entry_count;
+            map.size_mask = nextPowerOfTwo(U32(entry_count)) - 1;
+            map.table = malloc<HashMapEntry>(map.size_mask + 1);
+
+            for (int i = 0; i <= map.size_mask; i++) {
+                map.table[i].next_in_chain = -2;	// mark empty
+            }
+
+            _T key;
+            _U value;
+            for (int i = 0; i < entry_count; i++) {
+                s << key << value;
+                map.add(key, value);
+            }
+        }
+        else {
+            int i = 0;
+            map.findNext(i);
+            while (i != map.size_mask+1) {
+                HashMapEntry & e = map.entry(i);
+                
+                s << e.key << e.value;
+                
+                i++;
+                map.findNext(i);
+            }
+            //for(HashMap<_T, _U, _H, _E>::PseudoIndex i((map).start()); !(map).isDone(i); (map).advance(i)) {
+            //foreach(i, map) {
+            //    s << map[i].key << map[i].value;
+            //}
+        }
+
+        return s;
+    }
+
+    // Raw serialization: stores every table slot as-is. This requires more storage, but saves us from rehashing the elements.
+    template<typename _T, typename _U, typename _H, typename _E>
+    Stream & rawSerialize(Stream & s, HashMap<_T, _U, _H, _E> & map)
+    {
+        typedef typename HashMap<_T, _U, _H, _E>::Entry HashMapEntry;
+
+        if (s.isLoading()) {
+            map.clear();	// Drop the current contents before reading the new table.
+        }
+
+        s << map.size_mask;
+
+        if (map.size_mask != -1) {
+            s << map.entry_count;
+
+            if (s.isLoading()) {
+                map.table = malloc<HashMapEntry>(map.size_mask + 1);	// Not new[]: clear() releases the table with free().
+                for (int i = 0; i <= map.size_mask; i++) new (&map.table[i]) HashMapEntry();	// Construct slots before reading into them.
+            }
+            for (int i = 0; i <= map.size_mask; i++) {
+                HashMapEntry & e = map.table[i];
+                s << e.next_in_chain << e.hash_value;
+                s << e.key;
+                s << e.value;
+            }
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given hash maps.
+    template<typename _T, typename _U, typename _H, typename _E>
+    void swap(HashMap<_T, _U, _H, _E> & a, HashMap<_T, _U, _H, _E> & b)
+    {
+        swap(a.entry_count, b.entry_count);
+        swap(a.size_mask, b.size_mask);
+        swap(a.table, b.table);
+    }
+
+
+    template<typename T, typename U, typename H, typename E> // Hash 'key' with functor H, never returning the reserved TOMBSTONE_HASH.
+    uint HashMap<T, U, H, E>::compute_hash(const T& key) const
+    {
+        H hash;
+        uint hash_value = hash(key);
+        if (hash_value == TOMBSTONE_HASH) { // That value marks tombstone entries (see Entry::isTombstone).
+            hash_value ^= 0x8000; // Flip one bit so a live key never carries the tombstone marker.
+        }
+        return hash_value;
+    }
+
+    // Find the index of the matching entry. If no match, then return -1.
+    template<typename T, typename U, typename H, typename E>
+    int	HashMap<T, U, H, E>::findIndex(const T& key) const
+    {
+        if (table == NULL) return -1;
+
+        E equal;
+
+        uint hash_value = compute_hash(key);
+        int	index = hash_value & size_mask;
+
+        const Entry * e = &entry(index);
+        if (e->isEmpty()) return -1;
+        if (e->isTombstone() == false && int(e->hash_value & size_mask) != index) {
+            // occupied by a collider
+            return -1;
+        }
+
+        for (;;)
+        {
+            nvCheck(e->isTombstone() || (e->hash_value & size_mask) == (hash_value & size_mask));
+
+            if (e->hash_value == hash_value && equal(e->key, key))
+            {
+                // Found it.
+                return index;
+            }
+            nvDebugCheck(e->isTombstone() || !equal(e->key, key));   // keys are equal, but hash differs!
+
+            // Keep looking through the chain.
+            index = e->next_in_chain;
+            if (index == -1) break;	// end of chain
+
+            nvCheck(index >= 0 && index <= size_mask);
+            e = &entry(index);
+
+            nvCheck(e->isEmpty() == false || e->isTombstone());
+        }
+        return -1;
+    }
+
+    // Return the index of the newly cleared element.
+    template<typename T, typename U, typename H, typename E>
+    int HashMap<T, U, H, E>::removeTombstone(int index) {
+        Entry* e = &entry(index);
+        nvCheck(e->isTombstone());
+        nvCheck(!e->isEndOfChain());
+
+        // Move the next element of the chain into the
+        // tombstone slot, and return the vacated element.
+        int new_blank_index = e->next_in_chain;
+        Entry* new_blank = &entry(new_blank_index);
+        new (e) Entry(*new_blank);
+        new_blank->clear();
+        return new_blank_index;
+    }
+
+    // Helpers.
+    template<typename T, typename U, typename H, typename E>
+    typename HashMap<T, U, H, E>::Entry & HashMap<T, U, H, E>::entry(int index)
+    {
+        nvDebugCheck(table != NULL);
+        nvDebugCheck(index >= 0 && index <= size_mask);
+        return table[index];
+    }
+    template<typename T, typename U, typename H, typename E>
+    const typename HashMap<T, U, H, E>::Entry & HashMap<T, U, H, E>::entry(int index) const
+    {
+        nvDebugCheck(table != NULL);
+        nvDebugCheck(index >= 0 && index <= size_mask);
+        return table[index];
+    }
+
+
+    // Resize the hash table to the given size (Rehash the contents of the current table).  The arg is the number of
+    // hash table entries, not the number of elements we should actually contain (which will be less than this).
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::setRawCapacity(int new_size)
+    {
+        if (new_size <= 0) {
+            // Special case.
+            clear();
+            return;
+        }
+
+        // Force new_size to be a power of two.
+        new_size = nextPowerOfTwo(U32(new_size));
+
+        HashMap<T, U, H, E> new_hash;
+        new_hash.table = malloc<Entry>(new_size);
+        nvDebugCheck(new_hash.table != NULL);
+
+        new_hash.entry_count = 0;
+        new_hash.size_mask = new_size - 1;
+        for (int i = 0; i < new_size; i++)
+        {
+            new_hash.entry(i).next_in_chain = -2;	// mark empty
+        }
+
+        // Copy stuff to new_hash
+        if (table != NULL)
+        {
+            for (int i = 0, n = size_mask; i <= n; i++)
+            {
+                Entry * e = &entry(i);
+                if (e->isEmpty() == false && e->isTombstone() == false)
+                {
+                    // Insert old entry into new hash.
+                    new_hash.add(e->key, e->value);
+                    e->clear();	// placement delete of old element
+                }
+            }
+
+            // Delete our old data buffer.
+            free(table);
+        }
+
+        // Steal new_hash's data.
+        entry_count = new_hash.entry_count;
+        size_mask = new_hash.size_mask;
+        table = new_hash.table;
+        new_hash.entry_count = 0;
+        new_hash.size_mask = -1;
+        new_hash.table = NULL;
+    }
+
+    // Move the enumerator to the first live entry at or after i; i ends at size_mask+1 when none remain.
+    template<typename T, typename U, typename H, typename E>
+    void HashMap<T, U, H, E>::findNext(PseudoIndex & i) const {
+        while (i <= size_mask) {
+            const Entry & e = entry(i);
+            if (e.isEmpty() == false && e.isTombstone() == false) { // Skip empty slots and tombstones.
+                break;
+            }
+            i++;
+        }
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_HASHMAP_INL
diff --git a/thirdparty/thekla_atlas/nvcore/Memory.cpp b/thirdparty/thekla_atlas/nvcore/Memory.cpp
new file mode 100644
index 0000000000..302a2d84cb
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Memory.cpp
@@ -0,0 +1,153 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Memory.h"
+#include "Debug.h"
+#include "Utils.h"
+
+#include <stdlib.h>
+
+#ifdef NV_OS_LINUX
+#include <malloc.h>
+#endif
+
+#define USE_EFENCE 0
+
+#if USE_EFENCE
+extern "C" void *EF_malloc(size_t size);
+extern "C" void *EF_realloc(void * oldBuffer, size_t newSize);
+extern "C" void EF_free(void * address);
+#endif
+
+using namespace nv;
+
+#if NV_OVERRIDE_ALLOC
+
+void * malloc(size_t size) // Allocator override; only compiled when NV_OVERRIDE_ALLOC is non-zero.
+{
+#if USE_EFENCE
+    return EF_malloc(size);
+#else
+    return ::malloc(size); // NOTE(review): this definition is itself the global malloc, so this call looks self-recursive -- confirm before enabling NV_OVERRIDE_ALLOC.
+#endif
+}
+
+void * debug_malloc(size_t size, const char * file, int line) // Same allocation path as malloc(); the call-site file/line are accepted but ignored.
+{
+    NV_UNUSED(file);
+    NV_UNUSED(line);
+#if USE_EFENCE
+    return EF_malloc(size);
+#else
+    return ::malloc(size);
+#endif
+}
+
+void free(void * ptr) // Allocator override; only compiled when NV_OVERRIDE_ALLOC is non-zero.
+{
+#if USE_EFENCE
+    return EF_free(const_cast<void *>(ptr)); // the const_cast is a no-op here: ptr is already a plain void *
+#else
+    ::free(const_cast<void *>(ptr)); // NOTE(review): this definition is itself the global free, so this call looks self-recursive -- confirm before enabling NV_OVERRIDE_ALLOC.
+#endif
+}
+
+void * realloc(void * ptr, size_t size) // Allocator override; only compiled when NV_OVERRIDE_ALLOC is non-zero.
+{
+    nvDebugCheck(ptr != NULL || size != 0); // reject realloc(NULL, 0), whose behavior is not portable.
+#if USE_EFENCE
+    return EF_realloc(ptr, size);
+#else
+    return ::realloc(ptr, size); // NOTE(review): same apparent self-recursion as malloc()/free() above when the override is enabled.
+#endif
+}
+
+
+/* No need to override this unless we want line info.
+void * operator new (size_t size) throw()
+{
+    return malloc(size);
+}
+
+void operator delete (void *p) throw()
+{
+    free(p);
+}
+
+void * operator new [] (size_t size) throw()
+{
+    return malloc(size);
+}
+
+void operator delete [] (void * p) throw()
+{
+    free(p);
+}
+*/
+
+#if 0 // Code from Apple:
+void* operator new(std::size_t sz) throw (std::bad_alloc)
+{
+        void *result = std::malloc (sz == 0 ? 1 : sz);
+        if (result == NULL)
+                throw std::bad_alloc();
+        gNewCounter++;
+        return result;
+}
+void operator delete(void* p) throw()
+{
+        if (p == NULL)
+                return;
+        std::free (p);
+        gDeleteCounter++;
+}
+
+/* These are the 'nothrow' versions of the above operators.
+   The system version will try to call a std::new_handler if they
+   fail, but your overriding versions are not required to do this.  */
+void* operator new(std::size_t sz, const std::nothrow_t&) throw()
+{
+        try {
+                void * result = ::operator new (sz);  // calls our overridden operator new
+                return result;
+        } catch (std::bad_alloc &) {
+          return NULL;
+        }
+}
+void operator delete(void* p, const std::nothrow_t&) throw()
+{
+        ::operator delete (p);
+}
+
+#endif // 0
+
+#endif // NV_OVERRIDE_ALLOC
+
+void * nv::aligned_malloc(size_t size, size_t alignment)
+{
+    // Returns size bytes aligned to alignment (a power of two and a multiple of sizeof(void*)), or NULL on failure; release with nv::aligned_free.
+    nvDebugCheck(isPowerOfTwo(alignment));
+    nvDebugCheck((alignment & (sizeof(void*) - 1)) == 0);
+
+#if NV_OS_WIN32 || NV_OS_DURANGO
+    return _aligned_malloc(size, alignment);
+#elif NV_OS_DARWIN && !NV_OS_IOS
+    void * ptr = NULL;
+    if (posix_memalign(&ptr, alignment, size) != 0) ptr = NULL; // the stored pointer is not guaranteed after a failure, so report NULL explicitly
+    return ptr;
+#elif NV_OS_LINUX
+    return memalign(alignment, size);
+#else // NV_OS_ORBIS || NV_OS_IOS
+    // @@ IC: iOS appears to be 16 byte aligned, should we check alignment and assert if we request a higher alignment factor?
+    return ::malloc(size); // the requested alignment is not enforced on this path
+#endif
+}
+
+void nv::aligned_free(void * ptr) // Releases memory obtained from nv::aligned_malloc.
+{
+#if NV_OS_WIN32 || NV_OS_DURANGO
+    _aligned_free(ptr); // pairs with the _aligned_malloc path of aligned_malloc
+#else
+    ::free(ptr); // the remaining aligned_malloc paths use posix_memalign/memalign/malloc
+#endif
+}
+
diff --git a/thirdparty/thekla_atlas/nvcore/Memory.h b/thirdparty/thekla_atlas/nvcore/Memory.h
new file mode 100644
index 0000000000..1f71b60947
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Memory.h
@@ -0,0 +1,72 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include "nvcore.h"
+
+#include <stdlib.h> // malloc(), realloc() and free()
+#include <string.h> // memset
+//#include <stddef.h> // size_t
+
+//#include <new>	// new and delete
+
+#define TRACK_MEMORY_LEAKS 0
+#if TRACK_MEMORY_LEAKS
+#include <vld.h>
+#endif
+
+
+#if NV_CC_GNUC
+#   define NV_ALIGN_16 __attribute__ ((__aligned__ (16)))
+#else
+#   define NV_ALIGN_16 __declspec(align(16))
+#endif
+
+
+#define NV_OVERRIDE_ALLOC 0
+
+#if NV_OVERRIDE_ALLOC
+
+// Custom memory allocator
+extern "C" {
+    NVCORE_API void * malloc(size_t size);
+    NVCORE_API void * debug_malloc(size_t size, const char * file, int line);
+    NVCORE_API void free(void * ptr);
+    NVCORE_API void * realloc(void * ptr, size_t size);
+}
+
+/*
+#ifdef _DEBUG
+#define new new(__FILE__, __LINE__)
+#define malloc(i) debug_malloc(i, __FILE__, __LINE__)
+#endif
+*/
+
+#endif
+
+namespace nv {
+    NVCORE_API void * aligned_malloc(size_t size, size_t alignment);
+    NVCORE_API void aligned_free(void * );
+
+    // Typed wrappers over ::malloc/::realloc/::free. count is a number of T elements, not bytes; no constructors or destructors are run.
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
+        return (T *)::malloc(sizeof(T) * count); // NOTE(review): sizeof(T) * count is not checked for overflow
+    }
+
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
+        return (T *)::realloc(ptr, sizeof(T) * count); // result is whatever ::realloc returns, unchecked
+    }
+
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
+        ::free((void *)ptr); // the C-style cast drops const so const-qualified buffers can be released too
+    }
+
+    template <typename T> NV_FORCEINLINE void zero(T & data) {
+        memset(&data, 0, sizeof(T)); // byte-wise clear of the whole object
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_MEMORY_H
diff --git a/thirdparty/thekla_atlas/nvcore/Ptr.h b/thirdparty/thekla_atlas/nvcore/Ptr.h
new file mode 100644
index 0000000000..b43039274b
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Ptr.h
@@ -0,0 +1,322 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_PTR_H
+#define NV_CORE_PTR_H
+
+#include "nvcore.h"
+#include "Debug.h"
+
+#include "RefCounted.h"
+
+namespace nv
+{
+    class WeakProxy;
+
+    /** Simple auto pointer template class.
+    *
+    * This is very similar to the standard auto_ptr class, but with some 
+    * additional limitations to make its use less error prone:
+    * - Copy constructor and assignment operator are disabled.
+    * - reset method is removed.
+    * 
+    * The semantics of the standard auto_ptr are not clear and change depending
+    * on the std implementation. For a discussion of the problems of auto_ptr read:
+    * http://www.awprofessional.com/content/images/020163371X/autoptrupdate\auto_ptr_update.html
+    */
+    template <class T>
+    class AutoPtr
+    {
+        NV_FORBID_COPY(AutoPtr);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        /// Ctor. Takes ownership of p, which the dtor deletes.
+        AutoPtr(T * p = NULL) : m_ptr(p) { }
+
+        template <class Q>
+        AutoPtr(Q * p) : m_ptr(static_cast<T *>(p)) { }
+
+        /// Dtor. Deletes owned pointer.
+        ~AutoPtr() {
+            delete m_ptr;
+            m_ptr = NULL;
+        }
+
+        /// Delete owned pointer and assign new one. Assigning the pointer already held is a no-op.
+        void operator=( T * p ) {
+            if (p != m_ptr)
+            {
+                delete m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        template <class Q>
+        void operator=( Q * p ) {
+            if (p != m_ptr)
+            {
+                delete m_ptr;
+                m_ptr = static_cast<T *>(p);
+            }
+        }
+
+        /// Member access. The pointer must not be NULL.
+        T * operator -> () const {
+            nvDebugCheck(m_ptr != NULL);
+            return m_ptr;
+        }
+
+        /// Get reference. The pointer must not be NULL.
+        T & operator*() const {
+            nvDebugCheck(m_ptr != NULL);
+            return *m_ptr;
+        }
+
+        /// Get pointer. Ownership stays with this AutoPtr.
+        T * ptr() const { return m_ptr; }
+
+        /// Relinquish ownership of the underlying pointer and return that pointer.
+        T * release() {
+            T * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        /// Const pointer equality comparison.
+        friend bool operator == (const AutoPtr<T> & ap, const T * const p) {
+            return (ap.ptr() == p);
+        }
+
+        /// Const pointer inequality comparison.
+        friend bool operator != (const AutoPtr<T> & ap, const T * const p) {
+            return (ap.ptr() != p);
+        }
+
+        /// Const pointer equality comparison (raw pointer on the left).
+        friend bool operator == (const T * const p, const AutoPtr<T> & ap) {
+            return (ap.ptr() == p);
+        }
+
+        /// Const pointer inequality comparison (raw pointer on the left).
+        friend bool operator != (const T * const p, const AutoPtr<T> & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        T * m_ptr;
+    };
+
+
+    /// Intrusive reference-counting smart pointer: calls addRef()/release() on the pointee.
+    template <class BaseClass>
+    class SmartPtr {
+    public:
+
+        // BaseClass must implement addRef() and release().
+        typedef SmartPtr<BaseClass> ThisType;
+
+        /// Default ctor.
+        SmartPtr() : m_ptr(NULL) 
+        {
+        }
+
+        /// Other type copy ctor.
+        template <class OtherBase>
+        SmartPtr( const SmartPtr<OtherBase> & tc )
+        {
+            m_ptr = static_cast<BaseClass *>( tc.ptr() );
+            if (m_ptr) {
+                m_ptr->addRef();
+            }
+        }
+
+        /// Copy ctor.
+        SmartPtr( const ThisType & bc )
+        {
+            m_ptr = bc.ptr();
+            if (m_ptr) {
+                m_ptr->addRef();
+            }
+        }
+
+        /// Copy cast ctor. SmartPtr(NULL) is valid.
+        explicit SmartPtr( BaseClass * bc )
+        {
+            m_ptr = bc;
+            if (m_ptr) {
+                m_ptr->addRef();
+            }
+        }
+
+        /// Dtor. Drops this pointer's reference, if any.
+        ~SmartPtr()
+        {
+            set(NULL);
+        }
+
+
+        /// -> operator. The pointer must not be NULL.
+        BaseClass * operator -> () const
+        {
+            nvCheck( m_ptr != NULL );
+            return m_ptr;
+        }
+
+        /// * operator. The pointer must not be NULL.
+        BaseClass & operator*() const
+        {
+            nvCheck( m_ptr != NULL );
+            return *m_ptr;
+        }
+
+        /// Get pointer. The reference count is not touched.
+        BaseClass * ptr() const
+        {
+            return m_ptr;
+        }
+
+        /// Other type assignment.
+        template <class OtherBase>
+        void operator = ( const SmartPtr<OtherBase> & tc )
+        {
+            set( static_cast<BaseClass *>(tc.ptr()) );
+        }
+
+        /// This type assignment.
+        void operator = ( const ThisType & bc )
+        {
+            set( bc.ptr() );
+        }
+
+        /// Pointer assignment.
+        void operator = ( BaseClass * bc )
+        {
+            set( bc );
+        }
+
+
+        /// Other type equality comparison.
+        template <class OtherBase>
+        bool operator == ( const SmartPtr<OtherBase> & other ) const
+        {
+            return m_ptr == other.ptr();
+        }
+
+        /// This type equality comparison.
+        bool operator == ( const ThisType & bc ) const
+        {
+            return m_ptr == bc.ptr();
+        }
+
+        /// Const pointer equality comparison.
+        bool operator == ( const BaseClass * const bc ) const
+        {
+            return m_ptr == bc;
+        }
+
+        /// Other type inequality comparison.
+        template <class OtherBase>
+        bool operator != ( const SmartPtr<OtherBase> & other ) const
+        {
+            return m_ptr != other.ptr();
+        }
+
+        /// This type inequality comparison.
+        bool operator != ( const ThisType & bc ) const
+        {
+            return m_ptr != bc.ptr();
+        }
+
+        /// Const pointer inequality comparison.
+        bool operator != (const BaseClass * const bc) const
+        {
+            return m_ptr != bc;
+        }
+
+        /// This type less-than comparison (orders by raw pointer value).
+        bool operator < (const ThisType & p) const
+        {
+            return m_ptr < p.ptr();
+        }
+
+        bool isValid() const {
+            return isValidPtr(m_ptr);
+        }
+
+    private:
+
+        // Set this pointer.
+        void set( BaseClass * p )
+        {
+            if (p) p->addRef(); // take the new reference first so that re-assigning the same object never drops its count to zero
+            if (m_ptr) m_ptr->release();
+            m_ptr = p;
+        }
+
+    private:
+
+        BaseClass * m_ptr;
+
+    };
+
+
+    /// Weak pointer template class. Refers to the object through its shared WeakProxy instead of owning it.
+    template <class T>
+    class WeakPtr {
+    public:
+
+        WeakPtr() {}
+
+        WeakPtr(T * p)  { operator=(p); }
+        WeakPtr(const SmartPtr<T> & p) { operator=(p.ptr()); }
+
+        // The implicitly generated copy constructor and assignment from WeakPtr<T> are OK.
+
+        void operator=(T * p)
+        {
+            if (p) {
+                m_proxy = p->getWeakProxy(); // T must provide getWeakProxy()
+                nvDebugCheck(m_proxy != NULL);
+                nvDebugCheck(m_proxy->ptr() == p);
+            }
+            else {
+                m_proxy = NULL;
+            }
+        }
+
+        void operator=(const SmartPtr<T> & ptr) { operator=(ptr.ptr()); }
+
+        bool operator==(const SmartPtr<T> & p) const { return ptr() == p.ptr(); }
+        bool operator!=(const SmartPtr<T> & p) const { return ptr() != p.ptr(); }
+
+        bool operator==(const WeakPtr<T> & p) const { return ptr() == p.ptr(); }
+        bool operator!=(const WeakPtr<T> & p) const { return ptr() != p.ptr(); }
+
+        bool operator==(T * p) const { return ptr() == p; }
+        bool operator!=(T * p) const { return ptr() != p; }
+
+        T * operator->() const
+        {
+            T * p = ptr();
+            nvDebugCheck(p != NULL);
+            return p;
+        }
+
+        T * ptr() const // NULL when no proxy is attached; otherwise whatever the proxy currently reports
+        {
+            if (m_proxy != NULL) {
+                return static_cast<T *>(m_proxy->ptr());
+            }
+            return NULL;
+        }
+
+    private:
+
+        mutable SmartPtr<WeakProxy> m_proxy;
+
+    };
+
+
+} // nv namespace
+
+#endif // NV_CORE_PTR_H
diff --git a/thirdparty/thekla_atlas/nvcore/RadixSort.cpp b/thirdparty/thekla_atlas/nvcore/RadixSort.cpp
new file mode 100644
index 0000000000..3f44620c99
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/RadixSort.cpp
@@ -0,0 +1,285 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "RadixSort.h"
+
+#include "Utils.h"
+
+#include <string.h> // memset
+
+using namespace nv;
+
+static inline void FloatFlip(uint32 & f) { // In-place remap of a float's bit pattern; applied before radix-sorting floats as uint32 and undone by IFloatFlip.
+    //uint32 mask = -int32(f >> 31) | 0x80000000; // Michael Herf.
+    int32 mask = (int32(f) >> 31) | 0x80000000; // Warren Hunt, Manchor Ko. Sign bit set: flip all bits, else flip only the sign bit (assumes >> on a negative int32 is arithmetic).
+    f ^= mask;
+}
+
+static inline void IFloatFlip(uint32 & f) { // Inverse of FloatFlip: restores the original float bit pattern in place.
+    uint32 mask = ((f >> 31) - 1) | 0x80000000; // Michael Herf. Sign bit set: flip only the sign bit, else flip all bits.
+    //uint32 mask = (int32(f ^ 0x80000000) >> 31) | 0x80000000; // Warren Hunt, Manchor Ko. @@ Correct, but fails in release on gcc-4.2.1
+    f ^= mask;
+}
+
+
+template<typename T> 
+void createHistograms(const T * buffer, uint count, uint * histogram) // histogram receives one 256-entry table per byte of T.
+{
+    const uint bucketCount = sizeof(T); // (8 * sizeof(T)) / log2(radix)
+
+    // Init bucket pointers: h[k] counts memory byte k of each element; the big-endian mapping keeps table 0 for the least significant byte.
+    uint * h[bucketCount];
+    for (uint i = 0; i < bucketCount; i++) {
+#if NV_BIG_ENDIAN
+        h[sizeof(T)-1-i] = histogram + 256 * i;
+#else
+        h[i] = histogram + 256 * i;
+#endif
+    }
+
+    // Clear histograms (256 * sizeof(T) counters, which the caller must provide).
+    memset(histogram, 0, 256 * bucketCount * sizeof(uint));
+
+    // @@ Add support for signed integers.
+
+    // Build histograms. The unrolled body consumes exactly 4 or 8 bytes per element, so sizeof(T) must be 4 or 8.
+    const uint8 * p = (const uint8 *)buffer; // @@ Does this break aliasing rules?
+    const uint8 * pe = p + count * sizeof(T);
+
+    while (p != pe) {
+        h[0][*p++]++, h[1][*p++]++, h[2][*p++]++, h[3][*p++]++;
+        if (bucketCount == 8) h[4][*p++]++, h[5][*p++]++, h[6][*p++]++, h[7][*p++]++;
+    }
+}
+
+/*
+template <>
+void createHistograms<float>(const float * buffer, uint count, uint * histogram)
+{
+    // Init bucket pointers.
+    uint32 * h[4];
+    for (uint i = 0; i < 4; i++) {
+#if NV_BIG_ENDIAN
+        h[3-i] = histogram + 256 * i;
+#else
+        h[i] = histogram + 256 * i;
+#endif
+    }
+
+    // Clear histograms.
+    memset(histogram, 0, 256 * 4 * sizeof(uint32));
+
+    // Build histograms.
+    for (uint i = 0; i < count; i++) {
+        uint32 fi = FloatFlip(buffer[i]);
+
+        h[0][fi & 0xFF]++;
+        h[1][(fi >> 8) & 0xFF]++;
+        h[2][(fi >> 16) & 0xFF]++;
+        h[3][fi >> 24]++;
+    }
+}
+*/
+
+RadixSort::RadixSort() : m_size(0), m_ranks(NULL), m_ranks2(NULL), m_validRanks(false) // Empty sorter; buffers are allocated on first use.
+{
+}
+
+RadixSort::RadixSort(uint reserve_count) : m_size(0), m_ranks(NULL), m_ranks2(NULL), m_validRanks(false) // Pre-sizes both rank buffers for reserve_count elements.
+{
+    checkResize(reserve_count);
+}
+
+RadixSort::~RadixSort()
+{
+    // Release both rank buffers.
+    free(m_ranks2);
+    free(m_ranks);
+}
+
+
+void RadixSort::resize(uint count) // Reallocates both rank buffers to hold count entries each.
+{
+    m_ranks2 = realloc<uint>(m_ranks2, count); // NOTE(review): the realloc result is not checked for failure
+    m_ranks = realloc<uint>(m_ranks, count);
+}
+
+inline void RadixSort::checkResize(uint count) // Adapts to a new element count; any change of count invalidates the stored ranks.
+{
+    if (count != m_size)
+    {
+        if (count > m_size) resize(count); // m_size is the previous count, so buffers are only reallocated when it grows past that
+        m_size = count;
+        m_validRanks = false;
+    }
+}
+
+template <typename T> inline void RadixSort::insertionSort(const T * input, uint count) // Orders m_ranks by input value; count must be at least 1 (sort() guards count == 0).
+{
+    if (!m_validRanks) {
+        /*for (uint i = 0; i < count; i++) {
+            m_ranks[i] = i;
+        }*/
+
+        m_ranks[0] = 0; // no usable ranks yet: seed each rank with its own index while inserting
+        for (uint i = 1; i != count; ++i)
+        {
+            int rank = m_ranks[i] = i;
+
+            uint j = i;
+            while (j != 0 && input[rank] < input[m_ranks[j-1]]) // shift larger entries up to open a slot for rank
+            {
+                m_ranks[j] = m_ranks[j-1];
+                --j;
+            }
+            if (i != j)
+            {
+                m_ranks[j] = rank;
+            }
+        }
+
+        m_validRanks = true;
+    }
+    else {
+        for (uint i = 1; i != count; ++i) // ranks from the previous sort are reused as the starting order
+        {
+            int rank = m_ranks[i];
+
+            uint j = i;
+            while (j != 0 && input[rank] < input[m_ranks[j-1]])
+            {
+                m_ranks[j] = m_ranks[j-1];
+                --j;
+            }
+            if (i != j)
+            {
+                m_ranks[j] = rank;
+            }
+        }
+    }
+}
+
+template <typename T> inline void RadixSort::radixSort(const T * input, uint count)
+{
+    const uint P = sizeof(T); // pass count: one pass per byte of T
+
+    // Allocate histograms & offsets on the stack
+    uint histogram[256 * P];
+    uint * link[256];
+
+    createHistograms(input, count, histogram);
+
+    // Radix sort, j is the pass number (0=LSB, P-1=MSB)
+    for (uint j = 0; j < P; j++)
+    {
+        // Pointer to this bucket.
+        const uint * h = &histogram[j * 256];
+
+        const uint8 * inputBytes = (const uint8*)input; // @@ Is this aliasing legal?
+
+#if NV_BIG_ENDIAN
+        inputBytes += P - 1 - j;
+#else
+        inputBytes += j;
+#endif
+
+        if (h[inputBytes[0]] == count) {
+            // Skip this pass, all values are the same.
+            continue;
+        }
+
+        // Create offsets: link[b] is the next output slot in m_ranks2 for byte value b.
+        link[0] = m_ranks2;
+        for (uint i = 1; i < 256; i++) link[i] = link[i-1] + h[i-1];
+
+        // Perform Radix Sort
+        if (!m_validRanks)
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                *link[inputBytes[i*P]]++ = i; // stride of P bytes selects byte j of element i
+            }
+            m_validRanks = true;
+        }
+        else
+        {
+            for (uint i = 0; i < count; i++)
+            {
+                const uint idx = m_ranks[i]; // visit elements in the order left by the previous pass
+                *link[inputBytes[idx*P]]++ = idx;
+            }
+        }
+
+        // Swap pointers for next pass. Valid indices - the most recent ones - are in m_ranks after the swap.
+        swap(m_ranks, m_ranks2);
+    }
+
+    // Every pass was skipped and no ranks existed yet: generate linear ranks.
+    if (!m_validRanks)
+    {
+        for (uint i = 0; i < count; i++)
+        {
+            m_ranks[i] = i;
+        }
+        m_validRanks = true;
+    }
+}
+
+
+RadixSort & RadixSort::sort(const uint32 * input, uint count) // Computes ranks for count uint32 keys; returns *this for chaining.
+{
+    if (input == NULL || count == 0) return *this; // nothing to sort: ranks are left untouched
+
+    // Resize lists if needed
+    checkResize(count);
+
+    if (count < 32) { // small inputs go through insertion sort
+        insertionSort(input, count);
+    }
+    else {
+        radixSort<uint32>(input, count);
+    }
+    return *this;
+}
+
+
+RadixSort & RadixSort::sort(const uint64 * input, uint count) // Computes ranks for count uint64 keys; returns *this for chaining.
+{
+    if (input == NULL || count == 0) return *this; // nothing to sort: ranks are left untouched
+
+    // Resize lists if needed
+    checkResize(count);
+
+    if (count < 64) { // the insertion-sort cutoff is 64 here, versus 32 in the 32-bit overloads
+        insertionSort(input, count);
+    }
+    else {
+        radixSort(input, count);
+    }
+    return *this;
+}
+
+RadixSort& RadixSort::sort(const float * input, uint count) // Computes ranks for count float keys; returns *this for chaining.
+{
+    if (input == NULL || count == 0) return *this; // nothing to sort: ranks are left untouched
+
+    // Resize lists if needed
+    checkResize(count);
+
+    if (count < 32) {
+        insertionSort(input, count); // compares the floats directly, no bit remapping
+    }
+    else {
+        // @@ Avoid touching the input multiple times.
+        for (uint i = 0; i < count; i++) {
+            FloatFlip((uint32 &)input[i]); // casts away const: the caller's buffer is rewritten in place for the duration of the sort
+        }
+
+        radixSort<uint32>((const uint32 *)input, count);
+
+        for (uint i = 0; i < count; i++) {
+            IFloatFlip((uint32 &)input[i]); // restore the original float bit patterns
+        }
+    }
+
+    return *this;
+}
diff --git a/thirdparty/thekla_atlas/nvcore/RadixSort.h b/thirdparty/thekla_atlas/nvcore/RadixSort.h
new file mode 100644
index 0000000000..82325ebb24
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/RadixSort.h
@@ -0,0 +1,75 @@
+#pragma once
+#ifndef NV_CORE_RADIXSORT_H
+#define NV_CORE_RADIXSORT_H
+
+// Based on Pierre Terdiman's and Michael Herf's source code.
+// http://www.codercorner.com/RadixSortRevisited.htm
+// http://www.stereopsis.com/radix.html
+
+#include "nvcore.h"
+#include "Array.h"
+
+namespace nv
+{
+
+    class NVCORE_CLASS RadixSort
+    {
+        NV_FORBID_COPY(RadixSort);
+    public:
+        // Constructor/Destructor
+        RadixSort();
+        RadixSort(uint reserve_count);
+        ~RadixSort();
+
+        // Invalidate ranks, so the next sort starts from scratch instead of from the previous order.
+        RadixSort & reset() { m_validRanks = false; return *this; }
+
+        // Sorting methods. The input is not reordered; the result is read back through ranks()/rank().
+        RadixSort & sort(const uint32 * input, uint count);
+        RadixSort & sort(const uint64 * input, uint count);
+        RadixSort & sort(const float * input, uint count);
+
+        // Helpers: same as above, taking the buffer and count from an Array.
+        RadixSort & sort(const Array<uint32> & input);
+        RadixSort & sort(const Array<uint64> & input);
+        RadixSort & sort(const Array<float> & input);
+
+        // Access to results. m_ranks is a list of indices in sorted order, i.e. in the order you may further process your data
+        inline const uint * ranks() const { nvDebugCheck(m_validRanks); return m_ranks; }
+        inline uint * ranks() { nvDebugCheck(m_validRanks); return m_ranks; }
+        inline uint rank(uint i) const { nvDebugCheck(m_validRanks); return m_ranks[i]; }
+
+        // query whether the sort has been performed
+        inline bool valid() const { return m_validRanks; }
+
+    private:
+        uint m_size; // element count of the most recent checkResize() call
+        uint * m_ranks; // current ranks
+        uint * m_ranks2; // scratch buffer, swapped with m_ranks after each radix pass
+        bool m_validRanks;
+
+        // Internal methods
+        template <typename T> void insertionSort(const T * input, uint count);
+        template <typename T> void radixSort(const T * input, uint count);
+
+        void checkResize(uint nb);
+        void resize(uint nb);
+    };
+
+    inline RadixSort & RadixSort::sort(const Array<uint32> & input) { // forwards to the pointer/count overload
+        return sort(input.buffer(), input.count());
+    }
+
+    inline RadixSort & RadixSort::sort(const Array<uint64> & input) { // forwards to the pointer/count overload
+        return sort(input.buffer(), input.count());
+    }
+
+    inline RadixSort & RadixSort::sort(const Array<float> & input) { // forwards to the pointer/count overload
+        return sort(input.buffer(), input.count());
+    }
+
+} // nv namespace
+
+
+
+#endif // NV_CORE_RADIXSORT_H
diff --git a/thirdparty/thekla_atlas/nvcore/RefCounted.h b/thirdparty/thekla_atlas/nvcore/RefCounted.h
new file mode 100644
index 0000000000..b8d68edee3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/RefCounted.h
@@ -0,0 +1,149 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_REFCOUNTED_H
+#define NV_CORE_REFCOUNTED_H
+
+#include "nvcore.h"
+#include "Debug.h"
+
+#define NV_DECLARE_PTR(Class) \
+    template <class T> class SmartPtr; \
+    typedef SmartPtr<class Class> Class ## Ptr; \
+    typedef SmartPtr<const class Class> Class ## ConstPtr
+
+
+namespace nv
+{
+    /// Weak proxy. Reference-counted holder of an untyped object pointer that is cleared once the object reports its death.
+    class WeakProxy
+    {
+        NV_FORBID_COPY(WeakProxy);
+    public:
+	    /// Ctor. Starts with a reference count of zero.
+	    WeakProxy(void * ptr) : m_count(0), m_ptr(ptr) { }
+
+        /// Dtor. All references must have been released.
+        ~WeakProxy()
+        {
+            nvCheck( m_count == 0 );
+        }
+
+        /// Increase reference count and return the new count.
+        uint addRef() const
+        {
+            m_count++;
+            return m_count;
+        }
+
+        /// Decrease reference count and delete this proxy when it reaches 0. Returns the remaining count.
+        uint release() const
+        {
+            nvCheck( m_count > 0 );
+
+            m_count--;
+            if( m_count == 0 ) {
+                delete this;
+                return 0;
+            }
+            return m_count;
+        }
+
+	    /// True until notifyObjectDied() has cleared the pointer (or if the proxy was built around NULL).
+	    bool isAlive() const {
+		    return m_ptr != NULL;
+	    }
+
+	    /// Only the actual object should call this.
+	    void notifyObjectDied() {
+		    m_ptr = NULL;
+	    }
+
+        /// Return the tracked object pointer, or NULL after notifyObjectDied().
+        void * ptr() const {
+            return m_ptr;
+        }
+
+    private:
+        mutable int m_count;
+	    void * m_ptr;
+    };
+
+
+    /// Reference counted base class to be used with SmartPtr and WeakPtr.
+    class RefCounted
+    {
+        NV_FORBID_COPY(RefCounted);
+    public:
+
+        /// Ctor. Starts with a reference count of zero and no weak proxy.
+        RefCounted() : m_count(0), m_weak_proxy(NULL)
+        {
+        }
+
+        /// Virtual dtor. Expects no outstanding references and detaches the weak proxy.
+        virtual ~RefCounted()
+        {
+            nvCheck( m_count == 0 );
+            releaseWeakProxy();
+        }
+
+
+        /// Increase reference count and return the new count.
+        uint addRef() const
+        {
+            m_count++;
+            return m_count;
+        }
+
+
+        /// Decrease reference count and delete this object when it reaches 0. Returns the remaining count.
+        uint release() const
+        {
+            nvCheck( m_count > 0 );
+
+            m_count--;
+            if( m_count == 0 ) {
+                delete this;
+                return 0;
+            }
+            return m_count;
+        }
+
+        /// Get weak proxy, creating it on first use; this object keeps one reference to it.
+        WeakProxy * getWeakProxy() const
+        {
+            if (m_weak_proxy == NULL) {
+                m_weak_proxy = new WeakProxy((void *)this);
+                m_weak_proxy->addRef();
+            }
+            return m_weak_proxy;
+        }
+
+        /// Release the weak proxy: mark the object as dead in it and drop this object's reference.
+        void releaseWeakProxy() const
+        {
+            if (m_weak_proxy != NULL) {
+                m_weak_proxy->notifyObjectDied();
+                m_weak_proxy->release();
+                m_weak_proxy = NULL;
+            }
+        }
+
+        /// Get reference count.
+        int refCount() const
+        {
+            return m_count;
+        }
+
+
+    private:
+
+        mutable int m_count;
+        mutable WeakProxy * m_weak_proxy;
+
+    };
+
+} // nv namespace
+
+
+#endif // NV_CORE_REFCOUNTED_H
diff --git a/thirdparty/thekla_atlas/nvcore/StdStream.h b/thirdparty/thekla_atlas/nvcore/StdStream.h
new file mode 100644
index 0000000000..f65d6dab59
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/StdStream.h
@@ -0,0 +1,474 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+//#pragma once
+//#ifndef NV_CORE_STDSTREAM_H
+//#define NV_CORE_STDSTREAM_H
+
+#include "nvcore.h"
+#include "Stream.h"
+#include "Array.h"
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+
+namespace nv
+{
+
+    // Portable wrapper around fopen. Uses fopen_s on MSVC 2005 and newer.
+    // Returns the opened handle, or NULL when the file could not be opened.
+    inline FILE * fileOpen(const char * fileName, const char * mode)
+    {
+        nvCheck(fileName != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        FILE * handle;
+        const bool opened = (fopen_s(&handle, fileName, mode) == 0);
+        return opened ? handle : NULL;
+#else
+        return fopen(fileName, mode);
+#endif
+    }
+
+
+    /// Stream backed by a stdio FILE handle.
+    ///
+    /// Implements the positioning and error-state part of the Stream
+    /// interface. serialize() is not defined here; StdOutputStream and
+    /// StdInputStream add it. Positions and sizes are carried in uint.
+    class NVCORE_CLASS StdStream : public Stream
+    {
+        NV_FORBID_COPY(StdStream);
+    public:
+
+        /// Wrap an existing handle. When autoclose is true the destructor
+        /// closes the handle.
+        StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
+
+        /// Close the handle if there is one and this stream was asked to.
+        virtual ~StdStream()
+        {
+            if( !m_autoclose || m_fp == NULL ) {
+                return;
+            }
+#if NV_OS_WIN32
+            _fclose_nolock( m_fp );
+#else
+            fclose( m_fp );
+#endif
+        }
+
+
+        /** @name Stream implementation. */
+        //@{
+        /// Move to the absolute position pos, counted from the file start.
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(m_fp != NULL);
+            nvDebugCheck(pos <= size());
+#if NV_OS_WIN32
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+        }
+
+        /// Current position, counted from the file start.
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return _ftell_nolock(m_fp);
+#else
+            return (uint)ftell(m_fp);
+#endif
+        }
+
+        /// File size, measured by seeking to the end and then restoring the
+        /// previous position.
+        virtual uint size() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            const uint saved = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            const uint length = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, saved, SEEK_SET);
+#else
+            const uint saved = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            const uint length = (uint)ftell(m_fp);
+            fseek(m_fp, saved, SEEK_SET);
+#endif
+            return length;
+        }
+
+        /// True when there is no handle or the handle's error flag is set.
+        virtual bool isError() const
+        {
+            if (m_fp == NULL) {
+                return true;
+            }
+            return ferror( m_fp ) != 0;
+        }
+
+        /// Clear the handle's error and end-of-file flags.
+        virtual void clearError()
+        {
+            nvDebugCheck(m_fp != NULL);
+            clearerr(m_fp);
+        }
+
+        // @@ The original implementation used feof, which only returns true
+        // after an attempt to read *past* the end of the stream. After reading
+        // the last byte of a file, isAtEnd would therefore still return false
+        // even though the position is at the end of the file. That was not the
+        // intent and was inconsistent with MemoryInputStream, so this version
+        // compares the current position with the end position, obtained with
+        // ftell and fseek.
+        /// True when there is no handle or the position is at the file end.
+        virtual bool isAtEnd() const
+        {
+            if (m_fp == NULL) return true;
+#if NV_OS_WIN32
+            const uint current = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            const uint last = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, current, SEEK_SET);
+#else
+            const uint current = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            const uint last = (uint)ftell(m_fp);
+            fseek(m_fp, current, SEEK_SET);
+#endif
+            return current == last;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+        //@}
+
+    protected:
+
+        FILE * m_fp;        // Wrapped handle; may be NULL.
+        bool m_autoclose;   // Whether the destructor closes m_fp.
+
+    };
+
+
+    /// Write-only stream on top of a stdio FILE handle.
+    class NVCORE_CLASS StdOutputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdOutputStream);
+    public:
+
+        /// Open the named file in "wb" mode; the handle is closed on destruction.
+        StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
+
+        /// Write to an existing handle; autoclose decides whether it is
+        /// closed on destruction.
+        StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Write len bytes from data.
+        /// @return the number of bytes written (see the Darwin caveat below).
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fwrite_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fwrite_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // @@ No error checking, always returns len.
+            char * bytes = (char *)data;
+            for (uint written = 0; written != len; ++written) {
+                putc_unlocked(bytes[written], m_fp);
+            }
+            return len;
+#else
+            return (uint)fwrite(data, 1, len, m_fp);
+#endif
+        }
+
+        /// Always false: this stream only writes.
+        virtual bool isLoading() const { return false; }
+
+        /// Always true.
+        virtual bool isSaving() const { return true; }
+        //@}
+
+    };
+
+
+    /// Read-only stream on top of a stdio FILE handle.
+    class NVCORE_CLASS StdInputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdInputStream);
+    public:
+
+        /// Open the named file in "rb" mode; the handle is closed on destruction.
+        StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
+
+        /// Read from an existing handle; autoclose (default true) decides
+        /// whether it is closed on destruction.
+        StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read up to len bytes into data.
+        /// @return the number of bytes actually read, which is smaller than
+        /// len when the end of the file is reached first.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fread_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fread_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // This is rather lame. Not sure if it's faster than the locked version.
+            char * out = (char *)data;
+            for (uint i = 0; i < len; i++) {
+                // getc_unlocked returns EOF for both end-of-file and read
+                // errors. Test the returned int before storing anything, so
+                // that the EOF marker is never written into the caller's
+                // buffer and a read error stops the loop like fread does.
+                const int c = getc_unlocked(m_fp);
+                if (c == EOF) {
+                    return i;
+                }
+                out[i] = (char)c;
+            }
+            return len;
+#else
+            return (uint)fread(data, 1, len, m_fp);
+#endif
+
+        }
+
+        /// Always true: this stream only reads.
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        /// Always false.
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+    };
+
+
+
+    /// Read-only stream over a caller-supplied block of memory.
+    ///
+    /// The constructor stores only pointers into the block (nothing is
+    /// copied), so the memory has to stay valid while the stream is in use.
+    class NVCORE_CLASS MemoryInputStream : public Stream
+    {
+        NV_FORBID_COPY(MemoryInputStream);
+    public:
+
+        /// Read from the size bytes starting at mem.
+        MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Copy up to len bytes from the current position into data and
+        /// advance the position.
+        /// @return the number of bytes copied; 0 when the stream is in the
+        /// error state.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(!isError());
+
+            // seek() does not clamp, so the position can lie outside the
+            // buffer. In that case `m_size - tell()` would wrap around and the
+            // memcpy below would read out of bounds; read nothing instead.
+            if (isError()) return 0;
+
+            const uint left = m_size - tell();
+            if (len > left) len = left;
+
+            memcpy( data, m_ptr, len );
+            m_ptr += len;
+
+            return len;
+        }
+
+        /// Move to the absolute position pos. The position is not clamped;
+        /// a position past the end puts the stream in the error state.
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(!isError());
+            m_ptr = m_mem + pos;
+            nvDebugCheck(!isError());
+        }
+
+        /// Current position, counted from the start of the block.
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_ptr >= m_mem);
+            return uint(m_ptr - m_mem);
+        }
+
+        /// Size of the block as given to the constructor.
+        virtual uint size() const
+        {
+            return m_size;
+        }
+
+        /// True when there is no block or the position is outside of it.
+        virtual bool isError() const
+        {
+            return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+        }
+
+        /// Does nothing: the error state is derived from the position.
+        virtual void clearError()
+        {
+            // Nothing to do.
+        }
+
+        /// True when the position is exactly one past the last byte.
+        virtual bool isAtEnd() const
+        {
+            return m_ptr == m_mem + m_size;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+
+        /// Always true: this stream only reads.
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        /// Always false.
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+
+        /// Pointer to the current read position inside the block.
+        const uint8 * ptr() const { return m_ptr; }
+
+
+    private:
+
+        const uint8 * m_mem;  // Start of the block.
+        const uint8 * m_ptr;  // Current read position.
+        uint m_size;          // Size of the block in bytes.
+
+    };
+
+
+    /// Write-only stream that appends everything it receives to an
+    /// Array<uint8>. The array is held by reference.
+    class NVCORE_CLASS BufferOutputStream : public Stream
+    {
+        NV_FORBID_COPY(BufferOutputStream);
+    public:
+
+        /// Bind the stream to the array that receives the data.
+        BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
+
+        /// Append len bytes from data to the array; always returns len.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            uint8 * bytes = (uint8 *)data;
+            m_buffer.append(bytes, len);
+            return len;
+        }
+
+        /// Not implemented: the call is ignored.
+        virtual void seek( uint /*pos*/ )
+        {
+        }
+
+        /// The write position is always the end of the array.
+        virtual uint tell() const
+        {
+            return m_buffer.size();
+        }
+
+        /// Number of bytes currently in the array.
+        virtual uint size() const
+        {
+            return m_buffer.size();
+        }
+
+        /// Never reports an error.
+        virtual bool isError() const
+        {
+            return false;
+        }
+
+        /// Nothing to clear.
+        virtual void clearError()
+        {
+        }
+
+        /// Always true.
+        virtual bool isAtEnd() const
+        {
+            return true;
+        }
+
+        /// Always false.
+        virtual bool isSeekable() const
+        {
+            return false;
+        }
+
+        /// Always false: this stream only writes.
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+
+        /// Always true.
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+
+    private:
+        Array<uint8> & m_buffer;
+    };
+
+
+    /// Stream wrapper that forwards every call to another stream and checks
+    /// that stream's error state after serialize() and seek().
+    ///
+    /// On error a bare `throw;` is executed (except on NV_OS_ORBIS, where
+    /// nothing happens). Note that `throw;` re-throws the exception currently
+    /// being handled; when no exception is being handled it calls
+    /// std::terminate.
+    class NVCORE_CLASS ProtectedStream : public Stream
+    {
+        NV_FORBID_COPY(ProtectedStream);
+    public:
+
+        /// Wrap a stream by reference; the wrapped stream is never deleted.
+        ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
+        {
+        }
+
+        /// Wrap a stream by pointer; it is deleted on destruction when
+        /// autodelete is true (the default).
+        ProtectedStream( Stream * s, bool autodelete = true ) :
+        m_s(s), m_autodelete(autodelete)
+        {
+            nvDebugCheck(m_s != NULL);
+        }
+
+        /// Delete the wrapped stream if it was handed over with autodelete.
+        virtual ~ProtectedStream()
+        {
+            if( m_autodelete ) {
+                delete m_s;
+            }
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Forward to the wrapped stream, then check its error state.
+        /// @return the byte count reported by the wrapped stream.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            const uint transferred = m_s->serialize( data, len );
+            failIfError();
+            return transferred;
+        }
+
+        /// Forward to the wrapped stream, then check its error state.
+        virtual void seek( uint pos )
+        {
+            m_s->seek( pos );
+            failIfError();
+        }
+
+        virtual uint tell() const { return m_s->tell(); }
+
+        virtual uint size() const { return m_s->size(); }
+
+        virtual bool isError() const { return m_s->isError(); }
+
+        virtual void clearError() { m_s->clearError(); }
+
+        virtual bool isAtEnd() const { return m_s->isAtEnd(); }
+
+        virtual bool isSeekable() const { return m_s->isSeekable(); }
+
+        virtual bool isLoading() const { return m_s->isLoading(); }
+
+        virtual bool isSaving() const { return m_s->isSaving(); }
+        //@}
+
+
+    private:
+
+        /// Shared error check for serialize() and seek().
+        void failIfError() const
+        {
+            if( !m_s->isError() ) {
+                return;
+            }
+#if NV_OS_ORBIS
+            //SBtodoORBIS disabled (no exceptions)
+#else
+            throw;
+#endif
+        }
+
+        Stream * const m_s;        // Wrapped stream.
+        bool const m_autodelete;   // Whether the destructor deletes m_s.
+
+    };
+
+} // nv namespace
+
+
+//#endif // NV_CORE_STDSTREAM_H
diff --git a/thirdparty/thekla_atlas/nvcore/StrLib.cpp b/thirdparty/thekla_atlas/nvcore/StrLib.cpp
new file mode 100644
index 0000000000..7ec6c70136
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/StrLib.cpp
@@ -0,0 +1,796 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "StrLib.h"
+
+#include "Memory.h"
+#include "Utils.h" // swap
+
+#include <math.h>   // log
+#include <stdio.h>  // vsnprintf
+#include <string.h> // strlen, strcmp, etc.
+
+#if NV_CC_MSVC
+#include <stdarg.h> // vsnprintf
+#endif
+
+using namespace nv;
+
+namespace
+{
+    // Thin wrappers over the nvcore allocation templates, so that every
+    // string buffer in this file is allocated and freed the same way.
+    static char * strAlloc(uint size)
+    {
+        return malloc<char>(size);
+    }
+
+    static char * strReAlloc(char * str, uint size)
+    {
+        return realloc<char>(str, size);
+    }
+
+    static void strFree(const char * str)
+    {
+        free<char>(str);
+    }
+
+    /*static char * strDup( const char * str )
+    {
+        nvDebugCheck( str != NULL );
+        uint len = uint(strlen( str ) + 1);
+        char * dup = strAlloc( len );
+        memcpy( dup, str, len );
+        return dup;
+    }*/
+
+    // Write the digits of i in radix r (at most 36) starting at a, most
+    // significant digit first. No terminator is written.
+    // Returns the position just past the last digit.
+    static char * i2a( uint i, char *a, uint r )
+    {
+        const uint higher = i / r;
+        if( higher != 0 ) {
+            // Emit the more significant digits first.
+            a = i2a( higher, a, r );
+        }
+        *a++ = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % r];
+        return a;
+    }
+
+    // Locale independent character helpers (ASCII ranges only).
+    static inline char toUpper( char c ) {
+        if (c >= 'a' && c <= 'z') return c + 'A' - 'a';
+        return c;
+    }
+    static inline char toLower( char c ) {
+        if (c >= 'A' && c <= 'Z') return c + 'a' - 'A';
+        return c;
+    }
+    static inline bool isAlpha( char c ) {
+        const bool lower = (c >= 'a' && c <= 'z');
+        const bool upper = (c >= 'A' && c <= 'Z');
+        return lower || upper;
+    }
+    static inline bool isDigit( char c ) {
+        return !(c < '0' || c > '9');
+    }
+    static inline bool isAlnum( char c ) {
+        if (c >= 'a' && c <= 'z') return true;
+        if (c >= 'A' && c <= 'Z') return true;
+        return c >= '0' && c <= '9';
+    }
+
+}
+
+// Length of str, not counting the terminator. Asserts (debug) on NULL.
+uint nv::strLen(const char * str)
+{
+    nvDebugCheck(str != NULL);
+    const size_t length = strlen(str);
+    return U32(length);
+}
+
+// Case sensitive comparison with the strcmp result convention.
+// Asserts (debug) on NULL arguments.
+int nv::strDiff(const char * s1, const char * s2)
+{
+    nvDebugCheck(s1 != NULL);
+    nvDebugCheck(s2 != NULL);
+    const int order = strcmp(s1, s2);
+    return order;
+}
+
+// Case insensitive comparison with the strcmp result convention
+// (_stricmp on MSVC, strcasecmp elsewhere). Asserts (debug) on NULL arguments.
+int nv::strCaseDiff(const char * s1, const char * s2)
+{
+    nvDebugCheck(s1 != NULL);
+    nvDebugCheck(s2 != NULL); // Was a second check of s1, leaving s2 unchecked.
+#if NV_CC_MSVC
+    return _stricmp(s1, s2);
+#else
+    return strcasecmp(s1, s2);
+#endif
+}
+
+// Case sensitive equality. Accepts NULL: two NULLs are equal, NULL differs
+// from any non-NULL string.
+bool nv::strEqual(const char * s1, const char * s2)
+{
+    if (s1 == s2) {
+        return true; // Same pointer, including both NULL.
+    }
+    const bool bothValid = (s1 != NULL) && (s2 != NULL);
+    return bothValid && strcmp(s1, s2) == 0;
+}
+
+// Case insensitive equality. Accepts NULL: two NULLs are equal, NULL differs
+// from any non-NULL string.
+bool nv::strCaseEqual(const char * s1, const char * s2)
+{
+    if (s1 == s2) {
+        return true; // Same pointer, including both NULL.
+    }
+    const bool bothValid = (s1 != NULL) && (s2 != NULL);
+    return bothValid && strCaseDiff(s1, s2) == 0;
+}
+
+// True when str starts with prefix. Neither argument is checked for NULL.
+bool nv::strBeginsWith(const char * str, const char * prefix)
+{
+    const size_t prefixLength = strlen(prefix);
+    return strncmp(str, prefix, prefixLength) == 0;
+}
+
+// True when str ends with suffix.
+bool nv::strEndsWith(const char * str, const char * suffix)
+{
+    const uint strLength = strLen(str);
+    const uint suffixLength = strLen(suffix);
+    if (suffixLength > strLength) {
+        return false; // The suffix is longer than the string itself.
+    }
+    const char * tail = str + strLength - suffixLength;
+    return strncmp(tail, suffix, suffixLength) == 0;
+}
+
+// Copy the terminated string src into dst. size is the capacity of dst in
+// bytes, terminator included.
+// On the non-MSVC path a source that does not fit is truncated (and asserts
+// in debug builds) instead of being written past the end of dst.
+// @@ Add asserts to detect overlap between dst and src?
+void nv::strCpy(char * dst, uint size, const char * src)
+{
+    nvDebugCheck(dst != NULL);
+    nvDebugCheck(src != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+    strcpy_s(dst, size, src);
+#else
+    // This branch used to call strcpy and ignore size, so an oversized
+    // source overran dst.
+    nvDebugCheck(size > 0);
+    if (size == 0) return; // No room even for the terminator.
+
+    size_t count = strlen(src);
+    nvDebugCheck(count < size);
+    if (count >= size) count = size - 1;
+
+    // memmove keeps the dst == src case well defined.
+    memmove(dst, src, count);
+    dst[count] = '\0';
+#endif
+}
+
+// Copy at most len characters of src into dst and always terminate the
+// result. size is the capacity of dst in bytes, terminator included.
+void nv::strCpy(char * dst, uint size, const char * src, uint len)
+{
+    nvDebugCheck(dst != NULL);
+    nvDebugCheck(src != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+    strncpy_s(dst, size, src, len);
+#else
+    // With size == 0 the old code computed n == 0 and wrote dst[-1].
+    if (size == 0) return;
+
+    // Same value as min(len+1, size), but `len + 1` cannot wrap around.
+    const uint n = (len < size) ? len + 1 : size;
+    strncpy(dst, src, n);
+    dst[n-1] = '\0';
+#endif
+}
+
+// Append the terminated string src to the terminated string in dst.
+// size is the capacity of dst in bytes, terminator included.
+// On the non-MSVC path a source that does not fit is truncated (and asserts
+// in debug builds) instead of being written past the end of dst.
+void nv::strCat(char * dst, uint size, const char * src)
+{
+    nvDebugCheck(dst != NULL);
+    nvDebugCheck(src != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+    strcat_s(dst, size, src);
+#else
+    // This branch used to call strcat and ignore size, so an oversized
+    // source overran dst.
+    const size_t used = strlen(dst);
+    nvDebugCheck(used < size);
+    if (used + 1 >= size) return; // dst has no room left for any character.
+
+    const size_t room = size - used - 1; // Characters that still fit.
+    nvDebugCheck(strlen(src) <= room);
+    strncat(dst, src, room); // strncat always writes the terminator.
+#endif
+}
+
+// Return the first position in str that is not a space (' ').
+// Only the space character is skipped; tabs and newlines are not.
+NVCORE_API const char * nv::strSkipWhiteSpace(const char * str)
+{
+    nvDebugCheck(str != NULL);
+    for (; *str == ' '; ++str) {
+    }
+    return str;
+}
+
+// Non-const overload: return the first position in str that is not a
+// space (' '). Only the space character is skipped.
+NVCORE_API char * nv::strSkipWhiteSpace(char * str)
+{
+    nvDebugCheck(str != NULL);
+    for (; *str == ' '; ++str) {
+    }
+    return str;
+}
+
+
+/**
+* Pattern matching routine. I don't remember where I got this from.
+*
+* Matches the whole of str against pat. In the pattern:
+*  - '*' matches any run of characters, including an empty one.
+*  - '?' matches any single character.
+*  - '[...]' matches one character out of a set; 'a-z' inside the brackets
+*    is a range and is accepted with its bounds in either order.
+*  - NV_PATH_SEPARATOR is skipped and the pattern character after it is
+*    compared literally.
+*
+* @return true when the entire string matches the entire pattern.
+*/
+bool nv::strMatch(const char * str, const char * pat)
+{
+    nvDebugCheck(str != NULL);
+    nvDebugCheck(pat != NULL);
+
+    char c2;
+
+    while (true) {
+        // Pattern exhausted: it is a match only if the string is exhausted too.
+        if (*pat==0) {
+            if (*str==0) return true;
+            else         return false;
+        }
+        // String exhausted: only a '*' can still match (the empty run).
+        if ((*str==0) && (*pat!='*')) return false;
+        if (*pat=='*') {
+            pat++;
+            // A trailing '*' matches whatever is left.
+            if (*pat==0) return true;
+            // Try the rest of the pattern at every remaining position.
+            while (true) {
+                if (strMatch(str, pat)) return true;
+                if (*str==0) return false;
+                str++;
+            }
+        }
+        if (*pat=='?') goto match;
+        if (*pat=='[') {
+            pat++;
+            // Walk the set until a member matches *str.
+            while (true) {
+                // End of the set (or of the pattern) with no member matched.
+                if ((*pat==']') || (*pat==0)) return false;
+                if (*pat==*str) break;
+                if (pat[1] == '-') {
+                    // Range 'x-y': c2 is the other bound.
+                    c2 = pat[2];
+                    if (c2==0) return false;
+                    if ((*pat<=*str) && (c2>=*str)) break;
+                    if ((*pat>=*str) && (c2<=*str)) break;
+                    pat+=2;
+                }
+                pat++;
+            }
+            // Skip the rest of the set up to the closing ']'.
+            while (*pat!=']') {
+                if (*pat==0) {
+                    // Unterminated set: step back so that the pat++ at
+                    // 'match' lands on the terminator.
+                    pat--;
+                    break;
+                }
+                pat++;
+            }
+            goto match;
+        }
+
+        // NOTE(review): this reads like an escape-character rule (skip the
+        // character, compare the next one literally), which fits '\\'. Where
+        // NV_PATH_SEPARATOR is '/', the pattern "a/b" cannot match the string
+        // "a/b", because 'b' gets compared against '/'. Confirm the intent.
+        if (*pat == NV_PATH_SEPARATOR) {
+            pat++;
+            if (*pat==0) return false;
+        }
+        if (*pat!=*str) return false;
+
+match:
+        // One character matched: advance both the pattern and the string.
+        pat++;
+        str++;
+    }
+}
+
+// True when every character of str is a decimal digit. The empty string
+// also yields true; signs and decimal points are not accepted.
+bool nv::isNumber(const char * str) {
+    for (const char * p = str; *p != '\0'; ++p) {
+        if (!isDigit(*p)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+
+/** Create a builder with no buffer: size 0 and a NULL string. */
+StringBuilder::StringBuilder()
+    : m_size(0), m_str(NULL)
+{
+}
+
+/** Create an empty string with a buffer of size_hint bytes (must be > 0). */
+StringBuilder::StringBuilder( uint size_hint )
+    : m_size(size_hint)
+{
+    nvDebugCheck(m_size > 0);
+    m_str = strAlloc(m_size);
+    m_str[0] = '\0';
+}
+
+/** Copy constructor: starts with no buffer, then copies s. */
+StringBuilder::StringBuilder( const StringBuilder & s )
+    : m_size(0), m_str(NULL)
+{
+    copy(s);
+}
+
+/** Construct from a C string; a NULL s leaves the builder without a buffer. */
+StringBuilder::StringBuilder(const char * s)
+    : m_size(0), m_str(NULL)
+{
+    if (s == NULL) {
+        return;
+    }
+    copy(s);
+}
+
+/** Construct from at most len characters of s; s must not be NULL. */
+StringBuilder::StringBuilder(const char * s, uint len)
+    : m_size(0), m_str(NULL)
+{
+    copy(s, len);
+}
+
+/** Free the buffer. m_str may be NULL here (see the default constructor). */
+StringBuilder::~StringBuilder()
+{
+    strFree( m_str );
+}
+
+
+/** Replace the contents with printf-style formatted text; see formatList(). */
+StringBuilder & StringBuilder::format( const char * fmt, ... )
+{
+    nvDebugCheck(fmt != NULL);
+
+    va_list args;
+    va_start( args, fmt );
+    formatList( fmt, args );
+    va_end( args );
+
+    return *this;
+}
+
+
+/**
+* Format a string safely: replace the contents with the printf-style
+* expansion of fmt / arg, growing the buffer until the result fits.
+*
+* arg itself is never consumed here; every formatting attempt works on a
+* fresh va_copy, which is what makes the retries possible.
+*/
+StringBuilder & StringBuilder::formatList( const char * fmt, va_list arg )
+{
+    nvDebugCheck(fmt != NULL);
+
+    // Start with a small buffer when there is none yet.
+    if (m_size == 0) {
+        m_size = 64;
+        m_str = strAlloc( m_size );
+    }
+
+    va_list tmp;
+    va_copy(tmp, arg);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+    int n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp);
+#else
+    int n = vsnprintf(m_str, m_size, fmt, tmp);
+#endif
+    va_end(tmp);
+
+    // Retry until the output fits. A C99 vsnprintf returns the length it
+    // needed (so n >= m_size means truncation); vsnprintf_s with _TRUNCATE
+    // returns -1 on truncation.
+    // NOTE(review): a negative result that is a real formatting error rather
+    // than truncation would keep doubling the buffer forever - confirm.
+    while( n < 0 || n >= int(m_size) ) {
+        if( n > -1 ) {
+            // Required length is known: allocate exactly that plus terminator.
+            m_size = n + 1;
+        }
+        else {
+            // Required length is unknown: double the buffer.
+            m_size *= 2;
+        }
+
+        m_str = strReAlloc(m_str, m_size);
+
+        va_copy(tmp, arg);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp);
+#else
+        n = vsnprintf(m_str, m_size, fmt, tmp);
+#endif
+        va_end(tmp);
+    }
+
+    nvDebugCheck(n < int(m_size));
+
+    // Make sure it's null terminated.
+    nvDebugCheck(m_str[n] == '\0');
+    //str[n] = '\0';
+
+    return *this;
+}
+
+
+// Append the single character c.
+StringBuilder & StringBuilder::append( char c )
+{
+    const char * one = &c;
+    return append(one, 1);
+}
+
+// Append the terminated string s. s is passed to strlen, so it must not be NULL.
+StringBuilder & StringBuilder::append( const char * s )
+{
+    const uint len = U32(strlen( s ));
+    return append(s, len);
+}
+
+// Append at most len characters of s.
+StringBuilder & StringBuilder::append(const char * s, uint len)
+{
+    nvDebugCheck(s != NULL);
+
+    // Grow to hold the current text, the new characters and the terminator,
+    // then copy the new characters in right after the current text.
+    const uint tail = length();
+    reserve(tail + len + 1);
+    strCpy(m_str + tail, len + 1, s, len);
+
+    return *this;
+}
+
+// Append the contents of another builder.
+StringBuilder & StringBuilder::append(const StringBuilder & str)
+{
+    // A builder without a buffer holds no text. Forwarding its NULL pointer
+    // would hand NULL to the string copy, so there is simply nothing to do.
+    if (str.m_str == NULL) {
+        return *this;
+    }
+    return append(str.m_str, str.length());
+}
+
+
+/** Append printf-style formatted text; see appendFormatList(). */
+StringBuilder & StringBuilder::appendFormat( const char * fmt, ... )
+{
+    nvDebugCheck( fmt != NULL );
+
+    va_list args;
+    va_start( args, fmt );
+    appendFormatList( fmt, args );
+    va_end( args );
+
+    return *this;
+}
+
+
+/**
+* Append printf-style formatted text. With no buffer yet the text is
+* formatted straight into this builder; otherwise it is formatted into a
+* temporary builder and then appended.
+*/
+StringBuilder & StringBuilder::appendFormatList( const char * fmt, va_list arg )
+{
+    nvDebugCheck( fmt != NULL );
+
+    va_list tmp;
+    va_copy(tmp, arg);
+
+    if (m_size != 0) {
+        StringBuilder formatted;
+        formatted.formatList( fmt, tmp );
+        append( formatted.str() );
+    }
+    else {
+        formatList(fmt, arg);
+    }
+
+    va_end(tmp);
+
+    return *this;
+}
+
+// Append n space characters, allocating or growing the buffer as needed.
+StringBuilder & StringBuilder::appendSpace(uint n)
+{
+    if (m_str != NULL) {
+        const uint used = strLen(m_str);
+        const uint needed = used + n + 1;
+        if (m_size < needed) {
+            m_size = needed;
+            m_str = strReAlloc(m_str, m_size);
+        }
+        memset(m_str + used, ' ', n);
+        m_str[used + n] = '\0';
+        return *this;
+    }
+
+    // No buffer yet: the result is exactly n spaces plus the terminator.
+    m_size = n + 1;
+    m_str = strAlloc(m_size);
+    memset(m_str, ' ', n);
+    m_str[n] = '\0';
+
+    return *this;
+}
+
+
+/**
+* Convert number to string in the given base (2 to 36). The text is written
+* from the start of the buffer, replacing the previous contents. Negative
+* values get a leading '-'.
+*/
+StringBuilder & StringBuilder::number( int i, int base )
+{
+    nvCheck( base >= 2 );
+    nvCheck( base <= 36 );
+
+    // Take the magnitude in unsigned arithmetic, so that negating the most
+    // negative int does not overflow.
+    const bool negative = (i < 0);
+    const uint magnitude = negative ? (0u - uint(i)) : uint(i);
+
+    // Count the digits exactly. The former log()-based estimate was undefined
+    // for i <= 0 and left no room for the sign.
+    uint digits = 1;
+    for (uint rest = magnitude / uint(base); rest != 0; rest /= uint(base)) {
+        digits++;
+    }
+
+    reserve(digits + (negative ? 1 : 0) + 1); // sign + digits + terminator
+
+    char * out = m_str;
+    if( negative ) {
+        *out++ = '-';
+    }
+    *i2a(magnitude, out, base) = 0;
+
+    return *this;
+}
+
+
+/**
+* Convert number to string in the given base (2 to 36). The text is written
+* from the start of the buffer, replacing the previous contents.
+*/
+StringBuilder & StringBuilder::number( uint i, int base )
+{
+    nvCheck( base >= 2 );
+    nvCheck( base <= 36 );
+
+    // Count the digits exactly. The former log()-based estimate reserved too
+    // little (for example 1 byte for "5" plus its terminator) and was
+    // undefined for 0.
+    uint digits = 1;
+    for (uint rest = i / uint(base); rest != 0; rest /= uint(base)) {
+        digits++;
+    }
+
+    reserve(digits + 1); // digits + terminator
+
+    *i2a(i, m_str, base) = 0;
+
+    return *this;
+}
+
+
+/**
+* Make sure the buffer holds at least size_hint bytes (must not be 0).
+* The buffer only ever grows, and growing goes through strReAlloc.
+*/
+StringBuilder & StringBuilder::reserve( uint size_hint )
+{
+    nvCheck(size_hint != 0);
+
+    if (size_hint <= m_size) {
+        return *this; // Already large enough.
+    }
+
+    m_str = strReAlloc(m_str, size_hint);
+    m_size = size_hint;
+    return *this;
+}
+
+
+/** Replace the contents with a copy of the terminated string s (not NULL). */
+StringBuilder & StringBuilder::copy(const char * s)
+{
+    nvCheck( s != NULL );
+
+    // Byte count includes the terminator, which memcpy copies along.
+    const uint bytes = uint(strlen( s )) + 1;
+    reserve(bytes);
+    memcpy(m_str, s, bytes);
+
+    return *this;
+}
+
+/** Replace the contents with at most len characters of s (not NULL). */
+StringBuilder & StringBuilder::copy(const char * s, uint len)
+{
+    nvCheck( s != NULL );
+
+    const uint bytes = len + 1; // Characters plus terminator.
+    reserve(bytes);
+    strCpy(m_str, bytes, s, len);
+
+    return *this;
+}
+
+
+/**
+* Copy a StringBuilder. A source without a buffer resets this builder;
+* otherwise this builder grows to the source's buffer size and takes a copy
+* of its text.
+*/
+StringBuilder & StringBuilder::copy( const StringBuilder & s )
+{
+    // Copying onto itself would run a string copy with identical source and
+    // destination, which is undefined for strcpy; there is nothing to do.
+    if (this == &s) {
+        return *this;
+    }
+
+    if (s.m_str == NULL) {
+        nvCheck( s.m_size == 0 );
+        reset();
+    }
+    else {
+        reserve( s.m_size );
+        strCpy( m_str, s.m_size, s.m_str );
+    }
+    return *this;
+}
+
+// True when the contents end with str. A builder without a buffer is
+// treated as the empty string.
+bool StringBuilder::endsWith(const char * str) const
+{
+    const uint l = uint(strlen(str));
+    // m_str is NULL for a default-constructed builder; it used to be handed
+    // to strlen unchecked.
+    const uint ml = (m_str != NULL) ? uint(strlen(m_str)) : 0;
+    if (ml < l) return false;
+    if (l == 0) return true; // The empty suffix always matches.
+    return strncmp(m_str + ml - l, str, l) == 0;
+}
+
+// True when the contents begin with str. A builder without a buffer is
+// treated as the empty string.
+bool StringBuilder::beginsWith(const char * str) const
+{
+    const size_t l = strlen(str);
+    if (l == 0) return true; // The empty prefix always matches.
+    // m_str is NULL for a default-constructed builder; it used to be handed
+    // to strncmp unchecked.
+    if (m_str == NULL) return false;
+    return strncmp(m_str, str, l) == 0;
+}
+
+// Find given char starting from the end.
+// Returns a pointer to its last occurrence, or NULL when it does not occur.
+// The terminator is never searched, so reverseFind('\0') returns NULL.
+char * StringBuilder::reverseFind(char c)
+{
+    // m_str is NULL for a default-constructed builder; it used to be handed
+    // to strlen unchecked.
+    if (m_str == NULL) {
+        return NULL;
+    }
+
+    for (int index = (int)strlen(m_str) - 1; index >= 0; index--) {
+        if (m_str[index] == c) {
+            return m_str + index;
+        }
+    }
+    return NULL;
+}
+
+
+/** Free the buffer and go back to the no-buffer state (size 0, NULL string). */
+void StringBuilder::reset()
+{
+    strFree( m_str );
+    m_str = NULL;
+    m_size = 0;
+}
+
+/**
+* Hand the buffer over to the caller. The builder is left without a buffer
+* and will not free the returned pointer (which may be NULL).
+*/
+char * StringBuilder::release()
+{
+    char * const released = m_str;
+    m_str = NULL;
+    m_size = 0;
+    return released;
+}
+
+// Take ownership of string. The builder frees str later (destructor or
+// reset), so it has to come from an allocation that strFree can release.
+// A NULL str leaves the builder without a buffer.
+void StringBuilder::acquire(char * str)
+{
+    // Free the buffer held so far, unless it is the one being handed in.
+    // It used to be overwritten, which leaked it.
+    if (m_str != str) {
+        strFree(m_str);
+    }
+
+    if (str) {
+        m_size = strLen(str) + 1;
+        m_str = str;
+    }
+    else {
+        m_size = 0;
+        m_str = NULL;
+    }
+}
+
+// Exchange the buffers (and their sizes) of two builders; no text is copied.
+void nv::swap(StringBuilder & a, StringBuilder & b) {
+    swap(a.m_str, b.m_str);
+    swap(a.m_size, b.m_size);
+}
+
+
+/// File name part of this path; forwards to the static overload.
+const char * Path::fileName() const
+{
+    const char * const path = m_str;
+    return fileName(path);
+}
+
+
+/// Extension of this path; forwards to the static overload.
+const char * Path::extension() const
+{
+    const char * const path = m_str;
+    return extension(path);
+}
+
+
+/// Replace every '\\' and '/' in path with pathSeparator, in place.
+/// A NULL path is ignored.
+/*static */void Path::translatePath(char * path, char pathSeparator/*= NV_PATH_SEPARATOR*/) {
+    if (path == NULL) {
+        return;
+    }
+    for (char * p = path; *p != '\0'; ++p) {
+        if (*p == '\\' || *p == '/') {
+            *p = pathSeparator;
+        }
+    }
+}
+
+/// Convert all path separators of this path (both '\\' and '/') to
+/// pathSeparator. Does nothing on a null path.
+void Path::translatePath(char pathSeparator/*=NV_PATH_SEPARATOR*/)
+{
+    if (isNull()) {
+        return;
+    }
+    translatePath(m_str, pathSeparator);
+}
+
+/// Append pathSeparator unless the path already ends with '\\' or '/'.
+/// An empty (but non-null) path gets the separator as well.
+void Path::appendSeparator(char pathSeparator/*=NV_PATH_SEPARATOR*/)
+{
+    nvCheck(!isNull());
+
+    const uint l = length();
+
+    // The last character is at l - 1. The old test looked at m_str[l], which
+    // is always the terminator, so a separator was appended every time.
+    const bool endsWithSeparator =
+        (l > 0) && (m_str[l - 1] == '\\' || m_str[l - 1] == '/');
+
+    if (!endsWithSeparator) {
+        char separatorString[] = { pathSeparator, '\0' };
+        append(separatorString);
+    }
+}
+
+
+/**
+* Strip the file name from a path, keeping the trailing separator.
+* When no separator is found after the first character, the path becomes
+* the empty string.
+* @warning the path should not end with '/' or '\\'.
+*/
+void Path::stripFileName()
+{
+    nvCheck( m_str != NULL );
+
+    // Walk back from the last character to the nearest separator. Index 0 is
+    // never tested, so the walk stops there at the latest.
+    int index = (int)strlen(m_str) - 1;
+    for (; index > 0; index--) {
+        if (m_str[index] == '/' || m_str[index] == '\\') {
+            break;
+        }
+    }
+
+    // index == 0: no separator found, clear everything. Otherwise cut right
+    // after the separator (an empty path gives index == -1 and cuts at 0).
+    const int cut = (index != 0) ? index + 1 : 0;
+    m_str[cut] = 0;
+}
+
+
+/// Strip the extension from a path name: cut at the last '.', unless a path
+/// separator follows it. A '.' at index 0 is left alone.
+void Path::stripExtension()
+{
+    nvCheck( m_str != NULL );
+
+    int length = (int)strlen(m_str) - 1;
+    while (length > 0 && m_str[length] != '.') {
+        // Test every character on the way back, the last one included (it
+        // used to be skipped, so "dir.d/" lost ".d/"). Both '/' and '\\'
+        // count as separators, as in fileName() and extension().
+        if (m_str[length] == '\\' || m_str[length] == '/') {
+            return; // no extension
+        }
+        length--;
+    }
+    if (length > 0) {
+        m_str[length] = 0;
+    }
+}
+
+
+/// The path separator selected at compile time (NV_PATH_SEPARATOR).
+// static
+char Path::separator()
+{
+    const char platformSeparator = NV_PATH_SEPARATOR;
+    return platformSeparator;
+}
+
+// static
+// Pointer to the part of str that follows the last '\\' or '/', or str
+// itself when it contains no separator.
+const char * Path::fileName(const char * str)
+{
+    nvCheck( str != NULL );
+
+    const char * name = str;
+    for (const char * p = str; *p != '\0'; ++p) {
+        if (*p == '\\' || *p == '/') {
+            name = p + 1; // Start over after every separator.
+        }
+    }
+    return name;
+}
+
+// static
+// Pointer to the last '.' of the final path component (the dot is included),
+// or to the terminator of str when there is no extension. A '.' at index 0
+// does not count as an extension.
+const char * Path::extension(const char * str)
+{
+    nvCheck( str != NULL );
+
+    const int end = (int)strlen( str );
+
+    // Scan backwards; index 0 is deliberately left out.
+    for (int i = end - 1; i > 0; --i) {
+        const char c = str[i];
+        if (c == '\\' || c == '/') {
+            break; // A separator comes first: no extension.
+        }
+        if (c == '.') {
+            return &str[i];
+        }
+    }
+    return &str[end];
+}
+
+
+
+/// Return a String constructed from this string's character data.
+String String::clone() const
+{
+    String duplicate(data);
+    return duplicate;
+}
+
+// Point this String at a fresh copy of str and take the first reference.
+// A NULL str makes this a null String.
+void String::setString(const char * str)
+{
+    if (str != NULL) {
+        allocString( str );
+        addRef();
+        return;
+    }
+    data = NULL;
+}
+
+// Point this String at a fresh copy of the first length characters of str
+// (which must not be NULL) and take the first reference.
+void String::setString(const char * str, uint length)
+{
+    nvDebugCheck(str != NULL);
+
+    allocString( str, length );
+    addRef();
+}
+
+// Point this String at a fresh copy of the builder's text and take the first
+// reference. A builder without a buffer makes this a null String.
+void String::setString(const StringBuilder & str)
+{
+    const char * const text = str.str();
+    if (text != NULL) {
+        allocString(str.str());
+        addRef();
+        return;
+    }
+    data = NULL;
+}
+
+// Increase the reference count by one. Does nothing on a null String.
+void String::addRef()
+{
+    if (data == NULL) {
+        return;
+    }
+    setRefCount(getRefCount() + 1);
+}
+
+// Decrease the reference count by one. When it reaches zero the allocation
+// is freed and this String becomes null. Does nothing on a null String.
+void String::release()
+{
+    if (data == NULL) {
+        return;
+    }
+
+    const uint16 count = getRefCount();
+    setRefCount(count - 1);
+
+    if (count == 1) {
+        // That was the last reference. The allocation starts 2 bytes before
+        // the characters (see allocString).
+        free(data - 2);
+        data = NULL;
+    }
+}
+
+// Allocate storage for a string of len characters and copy str into it.
+// The allocation is 2 bytes larger than the characters plus terminator;
+// presumably those 2 bytes hold the reference count (setData / setRefCount
+// are defined elsewhere - confirm). The reference count starts at 0.
+void String::allocString(const char * str, uint len)
+{
+    const uint bytes = 2 + len + 1;
+    const char * ptr = malloc<char>(bytes);
+
+    setData( ptr );
+    setRefCount( 0 );
+
+    char * const chars = const_cast<char *>(data);
+
+    // Copy string.
+    strCpy(chars, len+1, str, len);
+
+    // Add terminating character.
+    chars[len] = '\0';
+}
+
+// Exchange the data pointers of two Strings; reference counts are untouched.
+void nv::swap(String & a, String & b)
+{
+    swap(a.data, b.data);
+}
diff --git a/thirdparty/thekla_atlas/nvcore/StrLib.h b/thirdparty/thekla_atlas/nvcore/StrLib.h
new file mode 100644
index 0000000000..ae4b5d12a0
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/StrLib.h
@@ -0,0 +1,433 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include "Debug.h"
+#include "Hash.h" // hash
+
+//#include <string.h> // strlen, etc.
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+namespace nv
+{
+
+    NVCORE_API uint strHash(const char * str, uint h) NV_PURE;
+
+    /// Hash a NUL-terminated string, Bernstein style: h = (33 * h) ^ c for each character c.
+    /// @param h Starting value of the hash.
+    inline uint strHash(const char * data, uint h = 5381)
+    {
+        // Walk the characters up to, but not including, the terminator.
+        for (const char * c = data; *c != 0; c++) {
+            h = (33 * h) ^ uint(*c);
+        }
+        return h;
+    }
+
+    template <> struct Hash<const char *> { // Hash functor for C strings: hashes the characters (strHash), not the pointer value.
+        uint operator()(const char * str) const { return strHash(str); }
+    };
+
+    NVCORE_API uint strLen(const char * str) NV_PURE;                       // Asserts on NULL strings.
+
+    NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE;       // Asserts on NULL strings.
+    NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE;   // Asserts on NULL strings.
+    NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE;     // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+
+    template <> struct Equal<const char *> { // Equality functor for C strings: compares contents through strEqual.
+        bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
+    };
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    template <int count> void strCpySafe(char (&buffer)[count], const char *src) { // strCpy with the size taken from the array type.
+        strCpy(buffer, count, src);
+    }
+
+    template <int count> void strCatSafe(char (&buffer)[count], const char * src) { // strCat with the size taken from the array type.
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder: a mutable character buffer (m_str) together with its container size (m_size).
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(char c);
+        StringBuilder & append(const char * str);
+        StringBuilder & append(const char * str, uint len);
+        StringBuilder & append(const StringBuilder & str);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+        bool isNull() const { return m_size == 0; } // Null means "no container", not "zero-length text".
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();       // Release ownership of string.
+        void acquire(char *);   // Take ownership of string.
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator. Exact comparison (strEqual accepts NULL); strMatch takes a pattern and is not symmetric.
+        bool operator==( const StringBuilder & s ) const {
+            return strEqual(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+        uint length() const { return isNull() ? 0 : strLen(m_str); }
+
+        /// Return the size of the string container.
+        uint capacity() const { return m_size; }
+
+        /// Return the hash of the string.
+        uint hash() const { return isNull() ? 0 : strHash(m_str); }
+
+        // Swap strings.
+        friend void swap(StringBuilder & a, StringBuilder & b);
+
+    protected:
+
+        /// Size of the string container.
+        uint m_size;
+
+        /// String.
+        char * m_str;
+
+    };
+
+
+    /// Path string. @@ This should be called PathBuilder.
+    class NVCORE_CLASS Path : public StringBuilder
+    {
+    public:
+        Path() : StringBuilder() {}
+        explicit Path(int size_hint) : StringBuilder(size_hint) {} // size_hint is forwarded to the StringBuilder constructor.
+        Path(const char * str) : StringBuilder(str) {}
+        Path(const Path & path) : StringBuilder(path) {}
+
+        const char * fileName() const;
+        const char * extension() const;
+
+        void translatePath(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void stripFileName();
+        void stripExtension();
+
+        // statics: the same queries on plain C strings, usable without building a Path.
+        NVCORE_API static char separator();
+        NVCORE_API static const char * fileName(const char *);
+        NVCORE_API static const char * extension(const char *); // Points at the '.' starting the extension, or at the terminator if there is none.
+
+        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+    };
+
+
+    /// Reference-counted string class. Copies share one buffer; a 16-bit count is stored just before the characters.
+    class NVCORE_CLASS String
+    {
+    public:
+
+        /// Constructs a null string. @sa isNull()
+        String()
+        {
+            data = NULL;
+        }
+
+        /// Constructs a shared copy of str.
+        String(const String & str)
+        {
+            data = str.data;
+            if (data != NULL) addRef();
+        }
+
+        /// Constructs a shared string from a standard string. NULL gives the null string.
+        String(const char * str)
+        {
+            setString(str);
+        }
+
+        /// Constructs a shared string from the first length characters of a standard string.
+        String(const char * str, int length)
+        {
+            setString(str, length);
+        }
+
+        /// Constructs a shared string from a StringBuilder.
+        String(const StringBuilder & str)
+        {
+            setString(str);
+        }
+
+        /// Dtor.
+        ~String()
+        {
+            release();
+        }
+
+        String clone() const;
+
+        /// Allocate a copy of str, then drop the current string. Safe even when str points into this string.
+        const String & operator=( const char * str )
+        {
+            String copy( str ); // Built before anything is released, so str is never read after being freed.
+            *this = copy;       // operator=(const String &) releases the old buffer and shares the new one.
+            return *this;
+        }
+
+        /// Allocate a copy of the builder's text, then drop the current string.
+        const String & operator=( const StringBuilder & str )
+        {
+            String copy( str ); // Same allocate-first order as the const char * overload.
+            *this = copy;
+            return *this;
+        }
+
+        /// Implement value semantics.
+        String & operator=( const String & str )
+        {
+            if (str.data != data)
+            {
+                release();
+                data = str.data;
+                addRef();
+            }
+            return *this;
+        }
+
+        /// Equal operator. Exact comparison (strEqual accepts NULL); strMatch takes a pattern and is not symmetric.
+        bool operator==( const String & str ) const
+        {
+            return strEqual(str.data, data);
+        }
+
+        /// Equal operator.
+        bool operator==( const char * str ) const
+        {
+            return strEqual(str, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const String & str ) const
+        {
+            return !strEqual(str.data, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const char * str ) const
+        {
+            return !strEqual(str, data);
+        }
+
+        /// Returns true if this string is the null string.
+        bool isNull() const { return data == NULL; }
+
+        /// Return the exact length.
+        uint length() const { nvDebugCheck(data != NULL); return strLen(data); }
+
+        /// Return the hash of the string.
+        uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+
+        /// const char * cast operator.
+        operator const char * () const { return data; }
+
+        /// Get string pointer.
+        const char * str() const { return data; }
+
+
+    private:
+
+        // Add reference count.
+        void addRef();
+
+        // Decrease reference count.
+        void release();
+
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        void setData(const char * str) {
+            data = str + 2;
+        }
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        const char * data;
+
+    };
+
+    template <> struct Hash<String> { // Hash functor for String; String::hash() asserts that the string is not null.
+        uint operator()(const String & str) const { return str.hash(); }
+    };
+
+
+    // Like AutoPtr, but for const char strings. Owns the pointer and releases it with delete [].
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor. Takes ownership of p (may be NULL).
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor. Steals the pointer and leaves ap empty.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+        
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one. Assigning the pointer already held is a no-op.
+        void operator=(const char * p) {
+            if (p != m_ptr) 
+            {
+                delete [] m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        // Get pointer. Ownership stays with this object.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquish ownership of the underlying pointer and returns that pointer.
+        const char * release() {
+            const char * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        // comparison operators. These compare pointer values, not string contents.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.ptr() != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H
diff --git a/thirdparty/thekla_atlas/nvcore/Stream.h b/thirdparty/thekla_atlas/nvcore/Stream.h
new file mode 100644
index 0000000000..c35c0d0c78
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Stream.h
@@ -0,0 +1,164 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_STREAM_H
+#define NV_CORE_STREAM_H
+
+#include "nvcore.h"
+#include "Debug.h"
+
+namespace nv
+{
+
+    /// Base stream class. The same operator<< is used for both loading and saving.
+    class NVCORE_CLASS Stream {
+    public:
+
+        enum ByteOrder {
+            LittleEndian = false,
+            BigEndian = true,
+        };
+
+        /// Get the byte order of the system.
+        static ByteOrder getSystemByteOrder() { 
+#if NV_LITTLE_ENDIAN
+            return LittleEndian;
+#else
+            return BigEndian;
+#endif
+        }
+
+
+        /// Ctor. Streams start out little endian.
+        Stream() : m_byteOrder(LittleEndian) { }
+
+        /// Virtual destructor.
+        virtual ~Stream() {}
+
+        /// Set byte order.
+        void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+
+        /// Get byte order.
+        ByteOrder byteOrder() const { return m_byteOrder; }
+
+
+        /// Serialize the given data.
+        virtual uint serialize( void * data, uint len ) = 0;
+
+        /// Move to the given position in the archive.
+        virtual void seek( uint pos ) = 0;
+
+        /// Return the current position in the archive.
+        virtual uint tell() const = 0;
+
+        /// Return the current size of the archive.
+        virtual uint size() const = 0;
+
+        /// Determine if there has been any error.
+        virtual bool isError() const = 0;
+
+        /// Clear errors.
+        virtual void clearError() = 0;
+
+        /// Return true if the stream is at the end.
+        virtual bool isAtEnd() const = 0;
+
+        /// Return true if the stream is seekable.
+        virtual bool isSeekable() const = 0;
+
+        /// Return true if this is an input stream.
+        virtual bool isLoading() const = 0;
+
+        /// Return true if this is an output stream.
+        virtual bool isSaving() const = 0;
+
+
+        void advance(uint offset) { seek(tell() + offset); }
+
+
+        // friends
+        friend Stream & operator<<( Stream & s, bool & c ) {
+            // Always go through a byte: the result then does not depend on
+            // sizeof(bool), and a byte other than 0 or 1 is never stored in a bool.
+            // When saving, b carries the value out. When loading, b is overwritten
+            // and normalized below. If serialize() leaves b untouched, c keeps its value.
+            uint8 b = c ? 1 : 0;
+            s.serialize( &b, 1 );
+            c = (b != 0);
+            // A bool always occupies exactly one byte in the stream.
+
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, char & c ) {
+            nvStaticCheck(sizeof(char) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint8 & c ) {
+            nvStaticCheck(sizeof(uint8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, int8 & c ) {
+            nvStaticCheck(sizeof(int8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint16 & c ) {
+            nvStaticCheck(sizeof(uint16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, int16 & c ) {
+            nvStaticCheck(sizeof(int16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, uint32 & c ) {
+            nvStaticCheck(sizeof(uint32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, int32 & c ) {
+            nvStaticCheck(sizeof(int32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, uint64 & c ) {
+            nvStaticCheck(sizeof(uint64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, int64 & c ) {
+            nvStaticCheck(sizeof(int64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, float & c ) {
+            nvStaticCheck(sizeof(float) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, double & c ) {
+            nvStaticCheck(sizeof(double) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+
+    protected:
+
+        /// Serialize in the stream byte order: in one call if it matches the system, otherwise byte by byte from last to first.
+        Stream & byteOrderSerialize( void * v, uint len ) {
+            if( m_byteOrder == getSystemByteOrder() ) {
+                serialize( v, len );
+            }
+            else {
+                for( uint i = len; i > 0; i-- ) {
+                    serialize( (uint8 *)v + i - 1, 1 );
+                }
+            }
+            return *this;
+        }
+
+
+    private:
+
+        ByteOrder m_byteOrder;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STREAM_H
diff --git a/thirdparty/thekla_atlas/nvcore/Utils.h b/thirdparty/thekla_atlas/nvcore/Utils.h
new file mode 100644
index 0000000000..f20e42cda8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/Utils.h
@@ -0,0 +1,315 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "Debug.h" // nvDebugCheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+#define NV_INT8_MIN    (-128)
+#define NV_INT8_MAX    127
+#define NV_UINT8_MAX    255
+#define NV_INT16_MIN    (-32767-1)
+#define NV_INT16_MAX    32767
+#define NV_UINT16_MAX   0xffff
+#define NV_INT32_MIN    (-2147483647-1)
+#define NV_INT32_MAX    2147483647
+#define NV_UINT32_MAX   0xffffffff
+#define NV_INT64_MAX    POSH_I64(9223372036854775807)
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX     65504.0F
+#define NV_FLOAT_MAX    3.402823466e+38F
+
+#define NV_INTEGER_TO_FLOAT_MAX  16777216     // 2^24: largest integer such that it and all smaller integers can be stored exactly in a 32bit float (16777217 cannot).
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint64 casts: the signed specializations check with nvDebugCheck that the value is not negative.
+    template <typename T> inline uint64 U64(T x) { return x; }
+    //template <> inline uint64 U64<uint64>(uint64 x) { return x; }
+    template <> inline uint64 U64<int64>(int64 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U64<uint32>(uint32 x) { return x; }
+    template <> inline uint64 U64<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U64<uint16>(uint16 x) { return x; }
+    template <> inline uint64 U64<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+    //template <> inline uint64 U64<uint8>(uint8 x) { return x; }
+    template <> inline uint64 U64<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint64)x; }
+
+    // int64 casts: only the uint64 source is range checked; the commented-out cases need no check.
+    template <typename T> inline int64 I64(T x) { return x; }
+    template <> inline int64 I64<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT64_MAX); return (int64)x; }
+    //template <> inline int64 I64<int64>(int64 x) { return x; }
+    //template <> inline int64 I64<uint32>(uint32 x) { return x; }
+    //template <> inline int64 I64<int32>(int32 x) { return x; }
+    //template <> inline int64 I64<uint16>(uint16 x) { return x; }
+    //template <> inline int64 I64<int16>(int16 x) { return x; }
+    //template <> inline int64 I64<uint8>(uint8 x) { return x; }
+    //template <> inline int64 I64<int8>(int8 x) { return x; }
+
+    // uint32 casts: wider sources are checked against NV_UINT32_MAX, signed sources against 0.
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+    //template <> inline uint32 U32<uint32>(uint32 x) { return x; }
+    template <> inline uint32 U32<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint16>(uint16 x) { return x; }
+    template <> inline uint32 U32<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint8>(uint8 x) { return x; }
+    template <> inline uint32 U32<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+
+    // int32 casts: every checked source must lie within [NV_INT32_MIN, NV_INT32_MAX].
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<int64>(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32<int32>(int32 x) { return x; }
+    //template <> inline int32 I32<uint16>(uint16 x) { return x; }
+    //template <> inline int32 I32<int16>(int16 x) { return x; }
+    //template <> inline int32 I32<uint8>(uint8 x) { return x; }
+    //template <> inline int32 I32<int8>(int8 x) { return x; }
+
+    // uint16 casts: wider sources are checked against NV_UINT16_MAX, signed sources against 0.
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    //template <> inline uint16 U16<uint16>(uint16 x) { return x; }
+    template <> inline uint16 U16<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+    //template <> inline uint16 U16<uint8>(uint8 x) { return x; }
+    template <> inline uint16 U16<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+
+    // int16 casts: every checked source must lie within [NV_INT16_MIN, NV_INT16_MAX].
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int64>(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int32>(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16<int16>(int16 x) { return x; }
+    //template <> inline int16 I16<uint8>(uint8 x) { return x; }
+    //template <> inline int16 I16<int8>(int8 x) { return x; }
+
+    // uint8 casts: wider sources are checked against NV_UINT8_MAX, signed sources against 0.
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint16>(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int16>(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    //template <> inline uint8 U8<uint8>(uint8 x) { return x; }
+    template <> inline uint8 U8<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; }
+    //template <> inline uint8 U8<float>(float x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; }
+
+    // int8 casts: every checked source must lie within [NV_INT8_MIN, NV_INT8_MAX].
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int64>(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int32>(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int16>(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint8>(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8<int8>(int8 x) { return x; }
+
+    // float casts: 32 and 64 bit integer sources are checked against NV_INTEGER_TO_FLOAT_MAX.
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int64>(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int32>(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    // The compiler should not complain about these conversions (8 and 16 bit sources need no check):
+    //template <> inline float F32<uint16>(uint16 x) { return (float)x; }
+    //template <> inline float F32<int16>(int16 x) { return (float)x; }
+    //template <> inline float F32<uint8>(uint8 x) { return (float)x; }
+    //template <> inline float F32<int8>(int8 x) { return (float)x; }
+
+
+    /// Exchange the values of a and b using one copy construction and two assignments.
+    template <typename T>
+    inline void swap(T & a, T & b)
+    {
+        T previousA(a); // Keeps a's value alive while a is overwritten.
+        a = b;
+        b = previousA;
+    }
+
+    /// Return the larger of a and b, by value. b is returned whenever b < a is false, so a NaN first argument yields b.
+    template <typename T>
+    inline T max(const T & a, const T & b)
+    {
+        if (b < a) return a;
+        return b;
+    }
+
+    /// Return the maximum of the four arguments, by value, computed as max(max(a, b), max(c, d)).
+    template <typename T>
+    inline T max4(const T & a, const T & b, const T & c, const T & d)
+    {
+        const T firstPair = max(a, b);
+        return max(firstPair, max(c, d));
+    }
+
+    /// Return the maximum of the three arguments, by value, computed as max(a, max(b, c)).
+    template <typename T>
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        const T lastPair = max(b, c);
+        return max(a, lastPair);
+    }
+
+    /// Return the smaller of a and b, by value. b is returned whenever a < b is false.
+    template <typename T>
+    inline T min(const T & a, const T & b)
+    {
+        if (a < b) return a;
+        return b;
+    }
+
+    /// Return the minimum of the three arguments, by value, computed as min(a, min(b, c)).
+    template <typename T>
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        const T lastPair = min(b, c);
+        return min(a, lastPair);
+    }
+
+    /// Clamp x to the range [a, b], by value: the lower bound a is applied first, then the upper bound b.
+    template <typename T>
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        const T raised = max(x, a);
+        return min(raised, b);
+    }
+
+    /** Return the smallest power of two that is greater than or equal to x.
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning x must not be 0 (asserted). In the enabled branch, 0 and any x above 2^31 wrap around and give 0.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint32 nextPowerOfTwo(uint32 x)
+    {
+        nvDebugCheck( x != 0 );
+#if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
+        x--; // So that an exact power of two maps to itself.
+        x |= x >> 1; // Each step copies the highest set bit further down...
+        x |= x >> 2;
+        x |= x >> 4;
+        x |= x >> 8;
+        x |= x >> 16; // ...until every bit below it is set.
+        return x+1;	
+#else
+        uint p = 1;
+        while( x > p ) {
+            p += p;
+        }
+        return p;
+#endif
+    }
+
+    inline uint64 nextPowerOfTwo(uint64 x) // 64 bit overload. Returns 0 when the result does not fit in 64 bits.
+    {
+        nvDebugCheck(x != 0);
+        uint64 p = 1; // Must be 64 bit: a 32 bit counter wraps to 0 for x > 2^31 and the loop never ends.
+        while (x > p && p != 0) { // p == 0 means the doubling overflowed (x > 2^63).
+            p += p;
+        }
+        return p;
+    }
+
+    // True when n has at most one bit set; note that this includes n == 0. @@ Should I just use a macro instead?
+    template <typename T>
+    inline bool isPowerOfTwo(T n)
+    {
+        return (n & (n-1)) == 0;
+    }
+
+
+    // @@ Move this to utils?
+    /// Delete all the elements of a container. The container is not emptied, so it still holds the deleted pointers afterwards.
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+        {
+            delete container[i];
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        // Default-initialize, in place, every slot in [old_size, new_size).
+        // Nothing happens when new_size <= old_size.
+        for (uint slot = old_size; slot < new_size; ++slot) new(ptr + slot) T;
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        // Copy-construct elem, in place, into every slot in [old_size, new_size).
+        // Nothing happens when new_size <= old_size.
+        for (uint slot = old_size; slot < new_size; ++slot) new(ptr + slot) T(elem);
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        // Copy-construct slot i from src[i] for every i in [old_size, new_size).
+        // src is indexed with the same i as ptr, not from 0.
+        for (uint slot = old_size; slot < new_size; ++slot) new(ptr + slot) T(src[slot]);
+    }
+
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        // Run the destructor, in place, on every slot in [new_size, old_size).
+        // Nothing happens when old_size <= new_size; no memory is released.
+        for (uint slot = new_size; slot < old_size; ++slot) (ptr + slot)->~T();
+    }
+
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        // Assign value to dst[0 .. count - 1], in increasing index order.
+        // The destination elements must already be constructed.
+        for (uint n = 0; n != count; ++n) dst[n] = value;
+    }
+
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        // Assign src[n] to dst[n] for n = 0 .. count - 1, in increasing index order.
+        // Both pointers are declared restrict, so the ranges must not overlap.
+        for (uint n = 0; n != count; ++n) dst[n] = src[n];
+    }
+
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        // Linear search of ptr[begin .. end - 1]; *index receives the first match when index is not NULL.
+        uint pos = begin;
+        while (pos < end && !(ptr[pos] == element)) ++pos;
+
+        if (pos >= end) return false;
+        if (index != NULL) *index = pos;
+        return true;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H
diff --git a/thirdparty/thekla_atlas/nvcore/nvcore.h b/thirdparty/thekla_atlas/nvcore/nvcore.h
new file mode 100644
index 0000000000..a3deb66be2
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/nvcore.h
@@ -0,0 +1,357 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+// Function linkage
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+
+// Platform definitions
+#include <posh.h>
+
+// OS:
+// NV_OS_WIN32
+// NV_OS_WIN64
+// NV_OS_MINGW
+// NV_OS_CYGWIN
+// NV_OS_LINUX
+// NV_OS_UNIX
+// NV_OS_DARWIN
+// NV_OS_XBOX
+// NV_OS_ORBIS
+// NV_OS_IOS
+
+#define NV_OS_STRING POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#   define NV_OS_LINUX 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
+#elif defined POSH_OS_FREEBSD
+#   define NV_OS_FREEBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_OPENBSD
+#   define NV_OS_OPENBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#   define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#   define NV_OS_MINGW 1
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#   define NV_OS_OSX 1      // IC: Adding this, because iOS defines NV_OS_DARWIN too.
+#   define NV_OS_DARWIN 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
+#elif defined POSH_OS_UNIX
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
+#   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_XBOX
+#   define NV_OS_XBOX 1
+#elif defined POSH_OS_DURANGO
+#   define NV_OS_DURANGO 1
+#else
+#   error "Unsupported OS"
+#endif
+
+
+// Is this a console OS? (i.e. connected to a TV)
+#if NV_OS_ORBIS || NV_OS_XBOX || NV_OS_DURANGO
+#   define NV_OS_CONSOLE 1
+#endif 
+
+
+// Threading:
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
+// CPUs:
+// NV_CPU_X86
+// NV_CPU_X86_64
+// NV_CPU_PPC
+// NV_CPU_ARM
+
+#define NV_CPU_STRING   POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
+#   define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#   define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#   define NV_CPU_PPC 1
+#elif defined POSH_CPU_STRONGARM
+#   define NV_CPU_ARM 1
+#else
+#   error "Unsupported CPU"
+#endif
+
+
+// Compiler:
+// NV_CC_GNUC
+// NV_CC_MSVC
+// NV_CC_CLANG
+
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang"
+#elif defined POSH_COMPILER_GCC
+#   define NV_CC_GNUC   1
+#   define NV_CC_STRING "gcc"
+#elif defined POSH_COMPILER_MSVC
+#   define NV_CC_MSVC   1
+#   define NV_CC_STRING "msvc"
+#else
+#   error "Unsupported compiler"
+#endif
+
+#if NV_CC_MSVC
+#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
+#else
+// @@ IC: This works in CLANG, about GCC?
+// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4.
+#ifdef __clang__
+#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
+#elif defined __GNUC__ 
+#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+#endif
+#endif
+
+// Endianness:
+#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+
+
+// Define the right printf prefix for size_t arguments:
+#if POSH_64BIT_POINTER
+#  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
+#else
+#  define NV_SIZET_PRINTF_PREFIX
+#endif
+
+
+// cmake config
+#include "nvconfig.h"
+
+#if NV_OS_DARWIN
+#include <stdint.h>
+//#include <inttypes.h>
+
+// Type definitions:
+typedef uint8_t     uint8;
+typedef int8_t      int8;
+
+typedef uint16_t    uint16;
+typedef int16_t     int16;
+
+typedef uint32_t    uint32;
+typedef int32_t     int32;
+
+typedef uint64_t    uint64;
+typedef int64_t     int64;
+
+// POSH gets this wrong due to __LP64__
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64_PRINTF_PREFIX "ll"
+
+#else
+
+// Type definitions:
+typedef posh_u8_t   uint8;
+typedef posh_i8_t   int8;
+
+typedef posh_u16_t  uint16;
+typedef posh_i16_t  int16;
+
+typedef posh_u32_t  uint32;
+typedef posh_i32_t  int32;
+
+//#if NV_OS_DARWIN
+// OSX-64 is supposed to be LP64 (longs and pointers are 64 bits), thus uint64 is defined as 
+// unsigned long. However, some OSX headers define it as unsigned long long, producing errors,
+// even though both types are 64 bit. Ideally posh should handle that, but it has not been
+// updated in ages, so here I'm just falling back to the standard C99 types defined in inttypes.h
+//#include <inttypes.h>
+//typedef posh_u64_t  uint64_t;
+//typedef posh_i64_t  int64_t;
+//#else
+typedef posh_u64_t  uint64;
+typedef posh_i64_t  int64;
+//#endif
+#if NV_OS_DARWIN
+// To avoid duplicate definitions.
+#define _UINT64
+#endif
+#endif
+
+// Aliases
+typedef uint32      uint;
+
+
+// Version string:
+#define NV_VERSION_STRING \
+    NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+// Disable copy constructor and assignment operator. 
+#if NV_CC_CPP11
+#define NV_FORBID_COPY(C) \
+    C( const C & ) = delete; \
+    C &operator=( const C & ) = delete
+#else
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & )
+#endif
+
+// Disable dynamic allocation on the heap. 
+// See Prohibiting Heap-Based Objects in More Effective C++.
+#define NV_FORBID_HEAPALLOC() \
+    private: \
+    void *operator new(size_t size); \
+    void *operator new[](size_t size)
+    //static void *operator new(size_t size); \
+    //static void *operator new[](size_t size);
+
+// String concatenation macros.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+#define NV_STRING2(x) #x
+#define NV_STRING(x) NV_STRING2(x)
+
+#if NV_CC_MSVC
+#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_END \
+    __pragma(warning(push)) \
+    __pragma(warning(disable:4127)) \
+    } while(false) \
+    __pragma(warning(pop))  
+#else
+#define NV_MULTI_LINE_MACRO_BEGIN do {
+#define NV_MULTI_LINE_MACRO_END } while(false)
+#endif
+
+#if NV_CC_CPP11
+#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
+#else
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine.
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);     // -- GODOT -- was a repeated int32 check; the 64-bit types were never verified
+NV_COMPILER_CHECK(sizeof(uint64) == 8);    // -- GODOT -- was a repeated uint32 check
+
+#include <stddef.h> // for size_t
+template <typename T, size_t N> char (&ArraySizeHelper(T (&array)[N]))[N];
+#define NV_ARRAY_SIZE(x) sizeof(ArraySizeHelper(x))
+//#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+#if 0 // Disabled in The Witness.
+#if NV_CC_MSVC
+#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
+#else
+#define NV_MESSAGE(x) message(x)
+#endif
+#else
+#define NV_MESSAGE(x) 
+#endif
+
+
+// Startup initialization macro.
+#define NV_AT_STARTUP(some_code) \
+    namespace { \
+        static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+            NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+        } \
+        NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+    }
+
+// Tell the compiler that a parameter is intentionally unused, to suppress compiler warnings.
+#if NV_CC_MSVC
+#define NV_UNUSED(a) ((a)=(a))
+#else
+#define NV_UNUSED(a) ((void)(a)) // -- GODOT -- was _Pragma("unused(a)"), which only Darwin toolchains honour
+#endif
+
+// Null index. @@ Move this somewhere else... it's only used by nvmesh.
+//const unsigned int NIL = unsigned int(~0);
+#define NIL uint(~0)
+
+// Null pointer.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes
+#if NV_CC_MSVC
+#   if NV_OS_WIN32
+#       include "DefsVcWin32.h"
+#   elif NV_OS_XBOX
+#       include "DefsVcXBox.h"
+#   elif NV_OS_DURANGO
+#       include "DefsVcDurango.h"
+#   else
+#       error "MSVC: Platform not supported"
+#   endif
+#elif NV_CC_GNUC
+#   if NV_OS_LINUX
+#       include "DefsGnucLinux.h"
+#   elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#       include "DefsGnucDarwin.h"
+#   elif NV_OS_ORBIS
+#       include "DefsOrbis.h"
+#   elif NV_OS_MINGW
+#       include "DefsGnucWin32.h"
+#   elif NV_OS_CYGWIN
+#       error "GCC: Cygwin not supported"
+#   else
+#       error "GCC: Platform not supported"
+#   endif
+#endif
+
+#endif // NV_CORE_H
diff --git a/thirdparty/thekla_atlas/nvcore/scanf.c b/thirdparty/thekla_atlas/nvcore/scanf.c
new file mode 100644
index 0000000000..bf9d293154
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvcore/scanf.c
@@ -0,0 +1,641 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp 
+ * From: static char sccsid[] = "@(#)strtol.c	8.1 (Berkeley) 6/4/93";
+ * From: static char sccsid[] = "@(#)strtoul.c	8.1 (Berkeley) 6/4/93";
+ */
+
+#include <stdio.h> 
+#include <stdlib.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma warning(disable : 4244) // conversion from '*' to '*', possible loss of data
+#pragma warning(disable : 4018) // signed/unsigned mismatch
+#pragma warning(disable : 4267) // '=' : conversion from 'size_t' to 'int', possible loss of data
+
+#define strtoq _strtoi64
+#define strtouq _strtoui64
+#define bcopy(b1,b2,len) (memmove((b2), (b1), (len)), (void) 0)
+
+typedef int long long quad_t;
+typedef unsigned long long u_quad_t;
+typedef unsigned char u_char;
+
+#define	BUF		32 	/* Maximum length of numeric string. */
+
+/*
+ * Flags used during conversion.
+ */
+#define	LONG		0x01	/* l: long or double */
+#define	SHORT		0x04	/* h: short */
+#define	SUPPRESS	0x08	/* suppress assignment */
+#define	POINTER		0x10	/* weird %p pointer (`fake hex') */
+#define	NOSKIP		0x20	/* do not skip blanks */
+#define	QUAD		0x400
+
+/*
+ * The following are used in numeric conversions only:
+ * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point;
+ * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral.
+ */
+#define	SIGNOK		0x40	/* +/- is (still) legal */
+#define	NDIGITS		0x80	/* no digits detected */
+
+#define	DPTOK		0x100	/* (float) decimal point is still legal */
+#define	EXPOK		0x200	/* (float) exponent (e+3, etc) still legal */
+
+#define	PFXOK		0x100	/* 0x prefix is (still) legal */
+#define	NZDIGITS	0x200	/* no zero digits detected */
+
+/*
+ * Conversion types.
+ */
+#define	CT_CHAR		0	/* %c conversion */
+#define	CT_CCL		1	/* %[...] conversion */
+#define	CT_STRING	2	/* %s conversion */
+#define	CT_INT		3	/* integer, i.e., strtoq or strtouq */
+typedef u_quad_t (*ccfntype)(const char *, char **, int);
+
+static const u_char *__sccl(char *, const u_char *);
+
+int	/* number of fields assigned; -1 when the input ran out before any conversion */
+vsscanf(const char *inp, char const *fmt0, va_list ap)
+{
+	int inr;		/* characters of inp not yet consumed */
+	const u_char *fmt = (const u_char *)fmt0;
+	int c;			/* character from format, or conversion */
+	size_t width;		/* field width, or 0 */
+	char *p;		/* points into all kinds of strings */
+	int n;			/* handy integer */
+	int flags;		/* flags as defined above */
+	char *p0;		/* saves original value of p when necessary */
+	int nassigned;		/* number of fields assigned */
+	int nconversions;	/* number of conversions */
+	int nread;		/* number of characters consumed from inp */
+	int base;		/* base argument to strtoq/strtouq */
+	ccfntype ccfn;		/* conversion function (strtoq/strtouq) */
+	char ccltab[256];	/* character class table for %[...] */
+	char buf[BUF];		/* buffer for numeric conversions */
+
+	/* `basefix' is used to avoid `if' tests in the integer scanner */
+	static short basefix[17] =
+		{ 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+
+	inr = strlen(inp);
+
+	nassigned = 0;
+	nconversions = 0;
+	nread = 0;
+	base = 0;		/* XXX just to keep gcc happy */
+	ccfn = NULL;		/* XXX just to keep gcc happy */
+	for (;;) {
+		c = *fmt++;
+		if (c == 0)
+			return (nassigned);
+		if (isspace(c)) {
+			while (inr > 0 && isspace((u_char)*inp))	/* -- GODOT -- isspace() needs an unsigned char value */
+				nread++, inr--, inp++;
+			continue;
+		}
+		if (c != '%')
+			goto literal;
+		width = 0;
+		flags = 0;
+		/*
+		 * switch on the format.  continue if done;
+		 * break once format type is derived.
+		 */
+again:		c = *fmt++;
+		switch (c) {
+		case '%':
+literal:
+			if (inr <= 0)
+				goto input_failure;
+			if ((u_char)*inp != c)	/* -- GODOT -- c comes from an unsigned format byte */
+				goto match_failure;
+			inr--, inp++;
+			nread++;
+			continue;
+
+		case '*':
+			flags |= SUPPRESS;
+			goto again;
+		case 'l':
+			flags |= LONG;
+			goto again;
+		case 'q':
+			flags |= QUAD;
+			goto again;
+		case 'h':
+			flags |= SHORT;
+			goto again;
+
+		case '0': case '1': case '2': case '3': case '4':
+		case '5': case '6': case '7': case '8': case '9':
+			width = width * 10 + c - '0';
+			goto again;
+
+		/*
+		 * Conversions.
+		 *
+		 */
+		case 'd':
+			c = CT_INT;
+			ccfn = (ccfntype)strtoq;
+			base = 10;
+			break;
+
+		case 'i':
+			c = CT_INT;
+			ccfn = (ccfntype)strtoq;
+			base = 0;
+			break;
+
+		case 'o':
+			c = CT_INT;
+			ccfn = strtouq;
+			base = 8;
+			break;
+
+		case 'u':
+			c = CT_INT;
+			ccfn = strtouq;
+			base = 10;
+			break;
+
+		case 'x':
+			flags |= PFXOK;	/* enable 0x prefixing */
+			c = CT_INT;
+			ccfn = strtouq;
+			base = 16;
+			break;
+
+		case 's':
+			c = CT_STRING;
+			break;
+
+		case '[':
+			fmt = __sccl(ccltab, fmt);
+			flags |= NOSKIP;
+			c = CT_CCL;
+			break;
+
+		case 'c':
+			flags |= NOSKIP;
+			c = CT_CHAR;
+			break;
+
+		case 'p':	/* pointer format is like hex */
+			flags |= POINTER | PFXOK;
+			c = CT_INT;
+			ccfn = strtouq;
+			base = 16;
+			break;
+
+		case 'n':
+			nconversions++;
+			if (flags & SUPPRESS)	/* ??? */
+				continue;
+			if (flags & SHORT)
+				*va_arg(ap, short *) = nread;
+			else if (flags & LONG)
+				*va_arg(ap, long *) = nread;
+			else if (flags & QUAD)
+				*va_arg(ap, quad_t *) = nread;
+			else
+				*va_arg(ap, int *) = nread;
+			continue;
+		}
+
+		/*
+		 * We have a conversion that requires input.
+		 */
+		if (inr <= 0)
+			goto input_failure;
+
+		/*
+		 * Consume leading white space, except for formats
+		 * that suppress this.
+		 */
+		if ((flags & NOSKIP) == 0) {
+			while (isspace((u_char)*inp)) {	/* -- GODOT -- unsigned char cast */
+				nread++;
+				if (--inr > 0)
+					inp++;
+				else
+					goto input_failure;
+			}
+			/*
+			 * Note that there is at least one character in
+			 * the buffer, so conversions that do not set NOSKIP
+			 * can no longer result in an input failure.
+			 */
+		}
+
+		/*
+		 * Do the conversion.
+		 */
+		switch (c) {
+
+		case CT_CHAR:
+			/* scan arbitrary characters (sets NOSKIP) */
+			if (width == 0)
+				width = 1;
+			if (flags & SUPPRESS) {
+				size_t sum = 0;
+				for (;;) {
+					if ((n = inr) < width) {
+						sum += n;
+						inr -= n;	/* -- GODOT -- input is exhausted now (was a dead `width -= n') */
+						inp += n;
+						if (sum == 0)
+							goto input_failure;
+						break;
+					} else {
+						sum += width;
+						inr -= width;
+						inp += width;
+						break;
+					}
+				}
+				nread += sum;
+			} else {
+				if (width > (size_t)inr)	/* -- GODOT -- never copy past the end of the input */
+					width = inr;
+				bcopy(inp, va_arg(ap, char *), width);
+				inr -= width, inp += width, nread += width;
+				nassigned++;
+			}
+			nconversions++;
+			break;
+
+		case CT_CCL:
+			/* scan a (nonempty) character class (sets NOSKIP) */
+			if (width == 0)
+				width = (size_t)~0;	/* `infinity' */
+			/* take only those things in the class */
+			if (flags & SUPPRESS) {
+				n = 0;
+				while (ccltab[(unsigned char)*inp]) {
+					n++, inr--, inp++;
+					if (--width == 0)
+						break;
+					if (inr <= 0) {
+						if (n == 0)
+							goto input_failure;
+						break;
+					}
+				}
+				if (n == 0)
+					goto match_failure;
+			} else {
+				p0 = p = va_arg(ap, char *);
+				while (ccltab[(unsigned char)*inp]) {
+					inr--;
+					*p++ = *inp++;
+					if (--width == 0)
+						break;
+					if (inr <= 0) {
+						if (p == p0)
+							goto input_failure;
+						break;
+					}
+				}
+				n = p - p0;
+				if (n == 0)
+					goto match_failure;
+				*p = 0;
+				nassigned++;
+			}
+			nread += n;
+			nconversions++;
+			break;
+
+		case CT_STRING:
+			/* like CCL, but zero-length string OK, & no NOSKIP */
+			if (width == 0)
+				width = (size_t)~0;
+			if (flags & SUPPRESS) {
+				n = 0;
+				while (!isspace((u_char)*inp)) {	/* -- GODOT -- unsigned char cast */
+					n++, inr--, inp++;
+					if (--width == 0)
+						break;
+					if (inr <= 0)
+						break;
+				}
+				nread += n;
+			} else {
+				p0 = p = va_arg(ap, char *);
+				while (!isspace((u_char)*inp)) {	/* -- GODOT -- unsigned char cast */
+					inr--;
+					*p++ = *inp++;
+					if (--width == 0)
+						break;
+					if (inr <= 0)
+						break;
+				}
+				*p = 0;
+				nread += p - p0;
+				nassigned++;
+			}
+			nconversions++;
+			continue;
+
+		case CT_INT:
+			/* scan an integer as if by strtoq/strtouq */
+#ifdef hardway
+			if (width == 0 || width > sizeof(buf) - 1)
+				width = sizeof(buf) - 1;
+#else
+			/* size_t is unsigned, hence this optimisation */
+			if (--width > sizeof(buf) - 2)
+				width = sizeof(buf) - 2;
+			width++;
+#endif
+			flags |= SIGNOK | NDIGITS | NZDIGITS;
+			for (p = buf; width; width--) {
+				c = *inp;
+				/*
+				 * Switch on the character; `goto ok'
+				 * if we accept it as a part of number.
+				 */
+				switch (c) {
+
+				/*
+				 * The digit 0 is always legal, but is
+				 * special.  For %i conversions, if no
+				 * digits (zero or nonzero) have been
+				 * scanned (only signs), we will have
+				 * base==0.  In that case, we should set
+				 * it to 8 and enable 0x prefixing.
+				 * Also, if we have not scanned zero digits
+				 * before this, do not turn off prefixing
+				 * (someone else will turn it off if we
+				 * have scanned any nonzero digits).
+				 */
+				case '0':
+					if (base == 0) {
+						base = 8;
+						flags |= PFXOK;
+					}
+					if (flags & NZDIGITS)
+					    flags &= ~(SIGNOK|NZDIGITS|NDIGITS);
+					else
+					    flags &= ~(SIGNOK|PFXOK|NDIGITS);
+					goto ok;
+
+				/* 1 through 7 always legal */
+				case '1': case '2': case '3':
+				case '4': case '5': case '6': case '7':
+					base = basefix[base];
+					flags &= ~(SIGNOK | PFXOK | NDIGITS);
+					goto ok;
+
+				/* digits 8 and 9 ok iff decimal or hex */
+				case '8': case '9':
+					base = basefix[base];
+					if (base <= 8)
+						break;	/* not legal here */
+					flags &= ~(SIGNOK | PFXOK | NDIGITS);
+					goto ok;
+
+				/* letters ok iff hex */
+				case 'A': case 'B': case 'C':
+				case 'D': case 'E': case 'F':
+				case 'a': case 'b': case 'c':
+				case 'd': case 'e': case 'f':
+					/* no need to fix base here */
+					if (base <= 10)
+						break;	/* not legal here */
+					flags &= ~(SIGNOK | PFXOK | NDIGITS);
+					goto ok;
+
+				/* sign ok only as first character */
+				case '+': case '-':
+					if (flags & SIGNOK) {
+						flags &= ~SIGNOK;
+						goto ok;
+					}
+					break;
+
+				/* x ok iff flag still set & 2nd char */
+				case 'x': case 'X':
+					if (flags & PFXOK && p == buf + 1) {
+						base = 16;	/* if %i */
+						flags &= ~PFXOK;
+						goto ok;
+					}
+					break;
+				}
+
+				/*
+				 * If we got here, c is not a legal character
+				 * for a number.  Stop accumulating digits.
+				 */
+				break;
+		ok:
+				/*
+				 * c is legal: store it and look at the next.
+				 */
+				*p++ = c;
+				/* -- GODOT -- always step past c so the push-back below rewinds exactly one char */
+				inp++;
+				if (--inr <= 0)
+					break;		/* end of input */
+			}
+			/*
+			 * If we had only a sign, it is no good; push
+			 * back the sign.  If the number ends in `x',
+			 * it was [sign] '0' 'x', so push back the x
+			 * and treat it as [sign] '0'.
+			 */
+			if (flags & NDIGITS) {
+				if (p > buf) {
+					inp--;
+					inr++;
+				}
+				goto match_failure;
+			}
+			c = ((u_char *)p)[-1];
+			if (c == 'x' || c == 'X') {
+				--p;
+				inp--;
+				inr++;
+			}
+			if ((flags & SUPPRESS) == 0) {
+				u_quad_t res;
+
+				*p = 0;
+				res = (*ccfn)(buf, (char **)NULL, base);
+				if (flags & POINTER)
+					*va_arg(ap, void **) =
+						(void *)(uintptr_t)res;
+				else if (flags & SHORT)
+					*va_arg(ap, short *) = res;
+				else if (flags & LONG)
+					*va_arg(ap, long *) = res;
+				else if (flags & QUAD)
+					*va_arg(ap, quad_t *) = res;
+				else
+					*va_arg(ap, int *) = res;
+				nassigned++;
+			}
+			nread += p - buf;
+			nconversions++;
+			break;
+
+		}
+	}
+input_failure:
+	return (nconversions != 0 ? nassigned : -1);
+match_failure:
+	return (nassigned);
+}
+
+
+/*
+ * Fill in the given table from the scanset at the given format
+ * (just after `[').  Return a pointer to the character past the
+ * closing `]'.  The table has a 1 wherever characters should be
+ * considered part of the scanset.
+ */
+static const u_char *	/* fmt just past the closing `]', or at the NUL if the set is unterminated */
+__sccl(char *tab, const u_char *fmt)
+{
+	int c, n, v;
+
+	/* a leading `^' negates the scanset: it selects the table's default value */
+	c = *fmt++;		/* first char hat => negated scanset */
+	if (c == '^') {
+		v = 1;		/* default => accept */
+		c = *fmt++;	/* get new first char */
+	} else
+		v = 0;		/* default => reject */
+
+	/* XXX: Will not work if sizeof(tab*) > sizeof(char) */
+	for (n = 0; n < 256; n++)
+		     tab[n] = v;	/* memset(tab, v, 256) */
+
+	if (c == 0)
+		return (fmt - 1);/* format ended before closing ] */
+
+	/*
+	 * Now set the entries corresponding to the actual scanset
+	 * to the opposite of the above.
+	 *
+	 * The first character may be ']' (or '-') without being special;
+	 * the last character may be '-'.
+	 */
+	v = 1 - v;
+	for (;;) {
+		tab[c] = v;		/* take character c */
+doswitch:
+		n = *fmt++;		/* and examine the next */
+		switch (n) {
+
+		case 0:			/* format ended too soon */
+			return (fmt - 1);
+
+		case '-':
+			/*
+			 * A scanset of the form
+			 *	[01+-]
+			 * is defined as `the digit 0, the digit 1,
+			 * the character +, the character -', but
+			 * the effect of a scanset such as
+			 *	[a-zA-Z0-9]
+			 * is implementation defined.  The V7 Unix
+			 * scanf treats `a-z' as `the letters a through
+			 * z', but treats `a-a' as `the letter a, the
+			 * character -, and the letter a'.
+			 *
+			 * For compatibility, the `-' is not considered
+			 * to define a range if the character following
+			 * it is either a close bracket (required by ANSI)
+			 * or is not numerically greater than the character
+			 * we just stored in the table (c).
+			 */
+			n = *fmt;
+			if (n == ']' || n < c) {
+				c = '-';
+				break;	/* resume the for(;;) */
+			}
+			fmt++;
+			/* fill in the range */
+			do {
+			    tab[++c] = v;
+			} while (c < n);
+			c = n;
+			/*
+			 * Alas, the V7 Unix scanf also treats formats
+			 * such as [a-c-e] as `the letters a through e'.
+			 * This too is permitted by the standard....
+			 */
+			goto doswitch;
+			break;	/* not reached: the goto above always leaves first */
+
+		case ']':		/* end of scanset */
+			return (fmt);
+
+		default:		/* just another character */
+			c = n;
+			break;
+		}
+	}
+	/* NOTREACHED */
+}
+
+/*
+int 
+sscanf(const char *ibuf, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	
+	va_start(ap, fmt);
+	ret = vsscanf(ibuf, fmt, ap);
+	va_end(ap);
+	
+	return(ret);
+}
+*/
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/thirdparty/thekla_atlas/nvimage/BitMap.cpp b/thirdparty/thekla_atlas/nvimage/BitMap.cpp
new file mode 100644
index 0000000000..8cc49644ea
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvimage/BitMap.cpp
@@ -0,0 +1,27 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "BitMap.h"
+
+using namespace nv;
+
+void BitMap::resize(uint w, uint h, bool initValue)
+{
+    BitArray tmp(w*h);
+
+    if (initValue) tmp.setAll();
+    else tmp.clearAll();
+
+    // -- GODOT -- Carry over only the region present in both the old and the new size; iterating the
+    // full old extent wrote into the wrong row, or outside tmp, whenever the map shrank.
+    for (uint y = 0; y < m_height && y < h; y++)
+    {
+        for (uint x = 0; x < m_width && x < w; x++)
+        {
+            // tmp already holds initValue everywhere, so only the bits that differ are flipped.
+            if (bitAt(x, y) != initValue) tmp.toggleBitAt(y*w + x);
+        }
+    }
+
+    swap(m_bitArray, tmp);
+    m_width = w;
+    m_height = h;
+}
diff --git a/thirdparty/thekla_atlas/nvimage/BitMap.h b/thirdparty/thekla_atlas/nvimage/BitMap.h
new file mode 100644
index 0000000000..a285321176
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvimage/BitMap.h
@@ -0,0 +1,87 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_BITMAP_H
+#define NV_IMAGE_BITMAP_H
+
+#include "nvimage.h"
+
+#include "nvcore/BitArray.h"
+
+namespace nv 
+{
+    /// Rectangular grid of bits kept in a BitArray, row by row (bit index = y * width + x).
+    class NVIMAGE_CLASS BitMap
+    {
+    public:
+        /// Empty 0 x 0 map.
+        BitMap() : m_width(0), m_height(0) {}
+        /// w x h map whose BitArray is constructed with w*h.
+        BitMap(uint w, uint h) : m_width(w), m_height(h), m_bitArray(w*h) {}
+
+        /// Dimensions, in bits.
+        uint width() const { return m_width; }
+        uint height() const { return m_height; }
+
+        /// Changes the dimensions to w x h (implemented in BitMap.cpp).
+        void resize(uint w, uint h, bool initValue);
+
+        /// Value of the bit at column x, row y; the coordinates are checked with nvDebugCheck.
+        bool bitAt(uint x, uint y) const
+        {
+            nvDebugCheck(x < m_width && y < m_height);
+            const uint idx = y * m_width + x;
+            return m_bitArray.bitAt(idx);
+        }
+
+        /// Value of the bit at linear index idx; no check is made at this level.
+        bool bitAt(uint idx) const { return m_bitArray.bitAt(idx); }
+
+        /// Sets the bit at column x, row y.
+        void setBitAt(uint x, uint y)
+        {
+            nvDebugCheck(x < m_width && y < m_height);
+            const uint idx = y * m_width + x;
+            m_bitArray.setBitAt(idx);
+        }
+
+        /// Sets the bit at linear index idx.
+        void setBitAt(uint idx) { m_bitArray.setBitAt(idx); }
+
+        /// Clears the bit at column x, row y.
+        void clearBitAt(uint x, uint y)
+        {
+            nvDebugCheck(x < m_width && y < m_height);
+            const uint idx = y * m_width + x;
+            m_bitArray.clearBitAt(idx);
+        }
+
+        /// Clears the bit at linear index idx.
+        void clearBitAt(uint idx) { m_bitArray.clearBitAt(idx); }
+
+        // Whole-map operations, forwarded to the underlying BitArray.
+        void clearAll() { m_bitArray.clearAll(); }
+        void setAll() { m_bitArray.setAll(); }
+        void toggleAll() { m_bitArray.toggleAll(); }
+
+        /// Exchanges the contents of two maps; both must have identical
+        /// dimensions (enforced with nvCheck) since only the bit arrays are swapped.
+        friend void swap(BitMap & a, BitMap & b)
+        {
+            nvCheck(a.m_width == b.m_width);
+            nvCheck(a.m_height == b.m_height);
+            swap(a.m_bitArray, b.m_bitArray);
+        }
+
+    private:
+
+        // Dimensions, followed by the bits themselves.
+        uint m_width;
+        uint m_height;
+        BitArray m_bitArray;
+
+    };
+
+} // nv namespace
+
+#endif // NV_IMAGE_BITMAP_H
diff --git a/thirdparty/thekla_atlas/nvimage/Image.cpp b/thirdparty/thekla_atlas/nvimage/Image.cpp
new file mode 100644
index 0000000000..8c0cbcf4e3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvimage/Image.cpp
@@ -0,0 +1,210 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Image.h"
+//#include "ImageIO.h"
+
+#include "nvmath/Color.h"
+
+#include "nvcore/Debug.h"
+#include "nvcore/Ptr.h"
+#include "nvcore/Utils.h" // swap
+#include "nvcore/Memory.h" // realloc, free
+
+#include <string.h> // memcpy
+
+
+using namespace nv;
+
+Image::Image() : m_width(0), m_height(0), m_depth(0), m_format(Format_RGB), m_data(NULL) // -- GODOT -- m_depth was left uninitialized
+{ // Empty image: zero dimensions, no pixel buffer, Format_RGB.
+}
+
+Image::Image(const Image & img) : m_format(img.m_format), m_data(NULL)
+{
+    // Deep copy: reserve a buffer with img's dimensions, then duplicate its pixels.
+    allocate(img.m_width, img.m_height, img.m_depth);
+    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
+}
+
+Image::~Image()
+{
+    free(); // Image::free(): releases the pixel buffer and clears m_data
+}
+
+const Image & Image::operator=(const Image & img)
+{
+    if (this == &img) return *this; // -- GODOT -- memcpy of a buffer onto itself is undefined
+    allocate(img.m_width, img.m_height, img.m_depth); // -- GODOT -- was m_depth: the source's depth was ignored
+    m_format = img.m_format;
+    memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth);
+    return *this;
+}
+
+void Image::allocate(uint w, uint h, uint d/*= 1*/)
+{
+    // Record the new dimensions and resize the buffer to w*h*d pixels through
+    // realloc<Color32>; no pixel data is rearranged for the new layout here.
+    m_width = w; m_height = h; m_depth = d;
+    m_data = realloc<Color32>(m_data, w * h * d);
+}
+
+void Image::acquire(Color32 * data, uint w, uint h, uint d/*= 1*/)
+{
+    // Drop the current buffer, then adopt data as this image's w*h*d pixel storage.
+    // NOTE(review): Image::free() hands m_data to ::free(), so data presumably must come from a matching allocator -- confirm.
+    free();
+    m_data = data;
+    m_width = w; m_height = h; m_depth = d;
+}
+
+void Image::resize(uint w, uint h, uint d/*= 1*/) {
+    // Rebuilds the image as w*h*d: the region shared with the old size is kept, the rest is Color32(0,0,0,0).
+    Image img;
+    img.allocate(w, h, d);
+
+    Color32 background(0,0,0,0);
+
+    // Copy the overlap, then pad the rest of each row, each slice and any extra slices.
+    uint x, y, z;
+    for(z = 0; z < min(d, m_depth); z++) {
+        for(y = 0; y < min(h, m_height); y++) {
+            for(x = 0; x < min(w, m_width); x++) {
+                img.pixel(x, y, z) = pixel(x, y, z);
+            }
+            for(; x < w; x++) {
+                img.pixel(x, y, z) = background;
+            }
+        }
+        for(; y < h; y++) {
+            for(x = 0; x < w; x++) {
+                img.pixel(x, y, z) = background;
+            }
+        }
+    }
+    for(; z < d; z++) {
+        for(y = 0; y < h; y++) {
+            for(x = 0; x < w; x++) {
+                img.pixel(x, y, z) = background;
+            }
+        }
+    }
+
+    swap(m_width, img.m_width);
+    swap(m_height, img.m_height);
+    swap(m_depth, img.m_depth);
+    // -- GODOT -- m_format is kept: swapping it with img's default (Format_RGB) discarded the image's format.
+    swap(m_data, img.m_data);
+}
+
+/*bool Image::load(const char * name)
+{
+    free();
+
+    AutoPtr<Image> img(ImageIO::load(name));
+    if (img == NULL) {
+        return false;
+    }
+
+    swap(m_width, img->m_width);
+    swap(m_height, img->m_height);
+	swap(m_depth, img->m_depth);
+    swap(m_format, img->m_format);
+    swap(m_data, img->m_data);
+
+    return true;
+}*/
+
+void Image::wrap(void * data, uint w, uint h, uint d)
+{
+    // Release the current buffer and point the image at caller-provided memory,
+    // treated as w*h*d Color32 pixels. unwrap() detaches it again without freeing.
+    free();
+    m_width = w; m_height = h; m_depth = d;
+    m_data = (Color32 *)data;
+}
+
+void Image::unwrap()
+{
+    // Detach from the pixel buffer without freeing it,
+    // and mark the image as empty (all dimensions zero).
+    m_data = NULL;
+    m_width = m_height = m_depth = 0;
+}
+
+
+void Image::free()
+{
+    ::free(m_data); // the namespace-scope free(), not this member function
+    m_data = NULL;  // so a second call (e.g. from the destructor) does not free twice
+}
+
+
+uint Image::width() const
+{
+    return m_width; // pixels per row
+}
+
+uint Image::height() const
+{
+    return m_height; // rows per depth slice
+}
+
+uint Image::depth() const
+{
+	return m_depth; // number of depth slices
+}
+
+const Color32 * Image::scanline(uint h) const
+{
+    nvDebugCheck(h < m_height);
+    return m_data + h * m_width; // start of row h; the offset does not involve m_depth, so this addresses the first slice
+}
+
+Color32 * Image::scanline(uint h)
+{
+    nvDebugCheck(h < m_height);
+    return m_data + h * m_width; // start of row h; the offset does not involve m_depth, so this addresses the first slice
+}
+
+const Color32 * Image::pixels() const
+{
+    return m_data; // raw pixel buffer; NULL when nothing is allocated or wrapped
+}
+
+Color32 * Image::pixels()
+{
+    return m_data; // raw pixel buffer; NULL when nothing is allocated or wrapped
+}
+
+const Color32 & Image::pixel(uint idx) const
+{
+    nvDebugCheck(idx < m_width * m_height * m_depth); // idx is a linear index over the whole buffer
+    return m_data[idx];
+}
+
+Color32 & Image::pixel(uint idx)
+{
+    nvDebugCheck(idx < m_width * m_height * m_depth); // idx is a linear index over the whole buffer
+    return m_data[idx];
+}
+
+
+Image::Format Image::format() const
+{
+    return m_format; // Format_RGB or Format_ARGB, as last stored
+}
+
+void Image::setFormat(Image::Format f)
+{
+    m_format = f; // only the stored value changes; the pixel data is not touched
+}
+
+void Image::fill(Color32 c)
+{
+    // Overwrite every pixel of every depth slice with c,
+    // walking the buffer once from front to back.
+    const uint count = m_width * m_height * m_depth;
+    Color32 * cur = m_data;
+    for (Color32 * const end = cur + count; cur != end; ++cur) *cur = c;
+}
+
diff --git a/thirdparty/thekla_atlas/nvimage/Image.h b/thirdparty/thekla_atlas/nvimage/Image.h
new file mode 100644
index 0000000000..4c5748cb00
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvimage/Image.h
@@ -0,0 +1,89 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_IMAGE_H
+#define NV_IMAGE_IMAGE_H
+
+#include "nvimage.h"
+#include "nvcore/Debug.h"
+
+namespace nv
+{
+    class Color32;
+
+    /// 32 bit RGBA image: a width x height x depth array of Color32 pixels.
+    class NVIMAGE_CLASS Image
+    {
+    public:
+        // Format tag carried by the image; read and written via format() / setFormat().
+        enum Format 
+        {
+            Format_RGB,
+            Format_ARGB,
+        };
+
+        Image();
+        Image(const Image & img);
+        ~Image();
+
+        const Image & operator=(const Image & img);
+
+        // Buffer setup. The depth argument always defaults to 1 (a single 2D slice).
+        void allocate(uint w, uint h, uint d = 1);
+        void acquire(Color32 * data, uint w, uint h, uint d = 1);
+        //bool load(const char * name);
+
+        void resize(uint w, uint h, uint d = 1);
+
+        void wrap(void * data, uint w, uint h, uint d = 1); // Adopt an external buffer (releases the current one first).
+        void unwrap();                                      // Forget the buffer without releasing it.
+
+        uint width() const;
+        uint height() const;
+        uint depth() const;
+
+        const Color32 * scanline(uint h) const; // Pointer to the first pixel of row h.
+        Color32 * scanline(uint h);
+
+        const Color32 * pixels() const; // Pointer to the first pixel of the whole buffer.
+        Color32 * pixels();
+
+        const Color32 & pixel(uint idx) const; // Pixel by linear index.
+        Color32 & pixel(uint idx);
+
+        const Color32 & pixel(uint x, uint y, uint z = 0) const; // Pixel by coordinates; index is (z * height + y) * width + x.
+        Color32 & pixel(uint x, uint y,  uint z = 0);
+
+        Format format() const;
+        void setFormat(Format f);
+
+        void fill(Color32 c); // Set every pixel to c.
+
+    private:
+        void free(); // Release m_data with the C runtime's free() and null it.
+
+    private:
+        uint m_width;    // Pixels per row.
+        uint m_height;   // Rows per depth slice.
+        uint m_depth;    // Number of depth slices.
+        Format m_format;
+        Color32 * m_data; // Pixel storage, laid out row by row, slice by slice.
+    };
+
+
+    inline const Color32 & Image::pixel(uint x, uint y, uint z) const // Read-only pixel at column x, row y, slice z.
+    {
+        nvDebugCheck(x < m_width && y < m_height && z < m_depth);
+        return pixel(x + m_width * (y + m_height * z));
+    }
+
+    inline Color32 & Image::pixel(uint x, uint y, uint z) // Writable pixel at column x, row y, slice z.
+    {
+        nvDebugCheck(x < m_width && y < m_height && z < m_depth);
+        return pixel(x + m_width * (y + m_height * z));
+    }
+
+} // nv namespace
+
+
+#endif // NV_IMAGE_IMAGE_H
diff --git a/thirdparty/thekla_atlas/nvimage/nvimage.h b/thirdparty/thekla_atlas/nvimage/nvimage.h
new file mode 100644
index 0000000000..5c89bd4726
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvimage/nvimage.h
@@ -0,0 +1,48 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_IMAGE_H
+#define NV_IMAGE_H
+
+#include "nvcore/nvcore.h"
+#include "nvcore/Debug.h" // nvDebugCheck
+#include "nvcore/Utils.h" // isPowerOfTwo
+
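+// Function linkage: NVIMAGE_API / NVIMAGE_CLASS expand to DLL export/import decorations only when NVIMAGE_SHARED is set, and to nothing otherwise.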
+#if NVIMAGE_SHARED
+#ifdef NVIMAGE_EXPORTS
+#define NVIMAGE_API DLL_EXPORT
+#define NVIMAGE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVIMAGE_API DLL_IMPORT
+#define NVIMAGE_CLASS DLL_IMPORT
+#endif
+#else
+#define NVIMAGE_API
+#define NVIMAGE_CLASS
+#endif
+
+
+namespace nv {
+
+    // Some utility functions:
+
+    inline uint computeBitPitch(uint w, uint bitsize, uint alignmentInBits)
+    {
+        nvDebugCheck(isPowerOfTwo(alignmentInBits));
+        const uint biased = w * bitsize + alignmentInBits - 1; // Row size in bits, biased so the step below rounds up.
+        return biased - biased % alignmentInBits;              // Largest multiple of alignmentInBits not above 'biased'.
+    }
+
+    inline uint computeBytePitch(uint w, uint bitsize, uint alignmentInBytes)
+    {
+        const uint pitch = computeBitPitch(w, bitsize, 8*alignmentInBytes);
+        nvDebugCheck((pitch & 7) == 0);
+        // The bit pitch is expected to be a whole number of bytes (checked above); convert it to bytes.
+        return (pitch + 7) >> 3;
+    }
+
+
+} // nv namespace
+
+#endif // NV_IMAGE_H
diff --git a/thirdparty/thekla_atlas/nvmath/Basis.cpp b/thirdparty/thekla_atlas/nvmath/Basis.cpp
new file mode 100644
index 0000000000..0824179633
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Basis.cpp
@@ -0,0 +1,270 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Basis.h"
+
+using namespace nv;
+
+
+/// Normalize basis vectors. Each one goes through normalizeSafe with a zero-vector fallback (presumably returned for vectors shorter than epsilon -- confirm in Vector.inl).
+void Basis::normalize(float epsilon /*= NV_EPSILON*/)
+{
+    normal = ::normalizeSafe(normal, Vector3(0.0f), epsilon);
+    tangent = ::normalizeSafe(tangent, Vector3(0.0f), epsilon);
+    bitangent = ::normalizeSafe(bitangent, Vector3(0.0f), epsilon);
+}
+
+
+/// Gram-Schmidt orthogonalization, applied in the order normal, tangent, bitangent.
+/// @note Works only if the vectors are close to orthogonal.
+void Basis::orthonormalize(float epsilon /*= NV_EPSILON*/)
+{
+    // N' = |N|                                   (|v| stands for "v normalized")
+    // T' = |T - (N' dot T) N'|
+    // B' = |B - (N' dot B) N' - (T' dot B) T'|
+
+    normal = ::normalize(normal, epsilon);
+
+    tangent -= normal * dot(normal, tangent); // Remove the part of T along N'.
+    tangent = ::normalize(tangent, epsilon);
+
+    bitangent -= normal * dot(normal, bitangent);   // Remove the part of B along N'...
+    bitangent -= tangent * dot(tangent, bitangent); // ...and the part along T'.
+    bitangent = ::normalize(bitangent, epsilon);
+}
+
+
+
+
+/// Robust orthonormalization: normalizes the vectors, then rebuilds any that are missing or not orthogonal.
+/// Produces an orthonormal basis even when the original is degenerate.
+void Basis::robustOrthonormalize(float epsilon /*= NV_EPSILON*/)
+{
+    // Normalize all vectors.
+    normalize(epsilon);
+
+    if (lengthSquared(normal) < epsilon*epsilon)
+    {
+        // Build normal from tangent and bitangent.
+        normal = cross(tangent, bitangent);
+
+        if (lengthSquared(normal) < epsilon*epsilon)
+        {
+            // Arbitrary basis: nothing usable is left, fall back to the canonical axes.
+            tangent   = Vector3(1, 0, 0);
+            bitangent = Vector3(0, 1, 0);
+            normal    = Vector3(0, 0, 1);
+            return;
+        }
+
+        normal = nv::normalize(normal, epsilon);
+    }
+
+    // Project tangents to normal plane.
+    tangent -= normal * dot(normal, tangent);
+    bitangent -= normal * dot(normal, bitangent);
+
+    if (lengthSquared(tangent) < epsilon*epsilon)
+    {
+        if (lengthSquared(bitangent) < epsilon*epsilon)
+        {
+            // Arbitrary basis: both tangents vanished, so only the normal constrains the frame.
+            buildFrameForDirection(normal);
+        }
+        else
+        {
+            // Build tangent from bitangent.
+            bitangent = nv::normalize(bitangent, epsilon);
+
+            tangent = cross(bitangent, normal);
+            nvDebugCheck(isNormalized(tangent, epsilon));
+        }
+    }
+    else
+    {
+        tangent = nv::normalize(tangent, epsilon);
+#if 0 // Disabled alternative: plain Gram-Schmidt of the bitangent against the tangent.
+        bitangent -= tangent * dot(tangent, bitangent);
+
+        if (lengthSquared(bitangent) < epsilon*epsilon)
+        {
+            bitangent = cross(tangent, normal);
+            nvDebugCheck(isNormalized(bitangent, epsilon));
+        }
+        else
+        {
+            bitangent = nv::normalize(bitangent, epsilon);
+        }
+#else
+        if (lengthSquared(bitangent) < epsilon*epsilon)
+        {
+            // Build bitangent from tangent.
+            bitangent = cross(tangent, normal);
+            nvDebugCheck(isNormalized(bitangent, epsilon));
+        }
+        else
+        {
+            bitangent = nv::normalize(bitangent, epsilon);
+
+            // At this point tangent and bitangent are orthogonal to normal, but we don't know their relative orientation.
+            
+            Vector3 bisector;
+            if (lengthSquared(tangent + bitangent) < epsilon*epsilon)
+            {
+                bisector = tangent;
+            }
+            else
+            {
+                bisector = nv::normalize(tangent + bitangent);
+            }
+            Vector3 axis = nv::normalize(cross(bisector, normal));
+
+            //nvDebugCheck(isNormalized(axis, epsilon));
+            nvDebugCheck(equal(dot(axis, tangent), -dot(axis, bitangent), epsilon));
+
+            // Place the new tangents symmetrically around the bisector, each on the side its original was on.
+            if (dot(axis, tangent) > 0)
+            {
+                tangent = bisector + axis;
+                bitangent = bisector - axis;
+            }
+            else
+            {
+                tangent = bisector - axis;
+                bitangent = bisector + axis;
+            }
+
+            // Make sure the resulting tangents are still perpendicular to the normal.
+            tangent -= normal * dot(normal, tangent);
+            bitangent -= normal * dot(normal, bitangent);
+
+            // Double check.
+            nvDebugCheck(equal(dot(normal, tangent), 0.0f, epsilon));
+            nvDebugCheck(equal(dot(normal, bitangent), 0.0f, epsilon));
+
+            // Normalize.
+            tangent = nv::normalize(tangent);
+            bitangent = nv::normalize(bitangent);
+
+            // If tangent and bitangent are not orthogonal, then derive bitangent from tangent, just in case...
+            if (!equal(dot(tangent, bitangent), 0.0f, epsilon)) {
+                bitangent = cross(tangent, normal);
+                bitangent = nv::normalize(bitangent);
+            }
+        }
+#endif
+    }
+
+    /*// Check vector lengths.
+    if (!isNormalized(normal, epsilon))
+    {
+    nvDebug("%f %f %f\n", normal.x, normal.y, normal.z);
+    nvDebug("%f %f %f\n", tangent.x, tangent.y, tangent.z);
+    nvDebug("%f %f %f\n", bitangent.x, bitangent.y, bitangent.z);
+    }*/
+
+    nvDebugCheck(isNormalized(normal, epsilon));
+    nvDebugCheck(isNormalized(tangent, epsilon));
+    nvDebugCheck(isNormalized(bitangent, epsilon));
+
+    // Check vector angles.
+    nvDebugCheck(equal(dot(normal, tangent), 0.0f, epsilon));
+    nvDebugCheck(equal(dot(normal, bitangent), 0.0f, epsilon));
+    nvDebugCheck(equal(dot(tangent, bitangent), 0.0f, epsilon));
+
+    // Check vector orientation: an orthonormal frame has determinant +1 or -1.
+    const float det = dot(cross(normal, tangent), bitangent);
+    nvDebugCheck(equal(det, 1.0f, epsilon) || equal(det, -1.0f, epsilon));
+}
+
+
+/// Build an arbitrary frame whose normal is the given direction d (expected to be unit length),
+/// optionally rotated around that normal by `angle` radians.
+void Basis::buildFrameForDirection(Vector3::Arg d, float angle/*= 0*/)
+{
+    nvCheck(isNormalized(d));
+    normal = d;
+
+    // Seed the tangent with the world axis along which the normal has its smallest magnitude.
+    const float ax = fabsf(normal.x);
+    const float ay = fabsf(normal.y);
+    const float az = fabsf(normal.z);
+    if (ax < ay && ax < az) {
+        tangent = Vector3(1, 0, 0);
+    } else if (ay < az) {
+        tangent = Vector3(0, 1, 0);
+    } else {
+        tangent = Vector3(0, 0, 1);
+    }
+
+    // Orthogonalize: strip the component along the normal, then renormalize.
+    tangent -= normal * dot(normal, tangent);
+    tangent = ::normalize(tangent);
+
+    // The third axis completes the frame.
+    bitangent = cross(normal, tangent);
+
+    // Rotate frame around normal according to angle; nothing to do for a zero angle.
+    if (angle == 0.0f) return;
+
+    const float cosAngle = cosf(angle);
+    const float sinAngle = sinf(angle);
+    const Vector3 spunTangent = cosAngle * tangent - sinAngle * bitangent;
+    bitangent = sinAngle * tangent + cosAngle * bitangent;
+    tangent = spunTangent;
+}
+
+bool Basis::isValid() const // True when no basis vector compares equal to zero and the determinant does not compare equal to zero.
+{
+    const Vector3 zero(0.0f);
+
+    const bool anyNullAxis = equal(normal, zero) || equal(tangent, zero) || equal(bitangent, zero);
+    if (anyNullAxis) return false;
+
+    // A vanishing determinant means the three vectors do not span 3D space.
+    return !equal(determinant(), 0.0f);
+}
+
+
+/// Transform by this basis (from this basis to object space):
+/// the result is v.x * tangent + v.y * bitangent + v.z * normal.
+Vector3 Basis::transform(Vector3::Arg v) const
+{
+    Vector3 result = tangent * v.x + bitangent * v.y;
+    result += normal * v.z;
+    return result;
+}
+
+/// Transform by the transpose (from object space to this basis): projects v onto tangent, bitangent and normal. Equals the inverse only for an orthonormal basis.
+Vector3 Basis::transformT(Vector3::Arg v)
+{
+    return Vector3(dot(tangent, v), dot(bitangent, v), dot(normal, v));
+}
+
+/// Transform by the inverse. (From object space to this basis). The basis must have a non-zero determinant.
+/// @note Uses Cramer's rule so the inverse is not accurate if the basis is ill-conditioned.
+Vector3 Basis::transformI(Vector3::Arg v) const
+{
+    const float det = determinant();
+    nvDebugCheck(!equal(det, 0.0f, 0.0f)); // Zero tolerance: only an exactly zero determinant trips the check.
+
+    const float idet = 1.0f / det;
+
+    // Rows of the inverse matrix (before the 1/det scale): r0 = B x N, r1 = N x T, r2 = T x B.
+    Vector3 r0(
+        (bitangent.y * normal.z - bitangent.z * normal.y),
+        -(bitangent.x * normal.z - bitangent.z * normal.x),
+        (bitangent.x * normal.y - bitangent.y * normal.x));
+
+    Vector3 r1(
+        -(tangent.y * normal.z - tangent.z * normal.y),
+        (tangent.x * normal.z - tangent.z * normal.x),
+        -(tangent.x * normal.y - tangent.y * normal.x));
+
+    Vector3 r2(
+        (tangent.y * bitangent.z - tangent.z * bitangent.y),
+        -(tangent.x * bitangent.z - tangent.z * bitangent.x),
+        (tangent.x * bitangent.y - tangent.y * bitangent.x));
+
+    return Vector3(dot(v, r0), dot(v, r1), dot(v, r2)) * idet;
+}
+
+
diff --git a/thirdparty/thekla_atlas/nvmath/Basis.h b/thirdparty/thekla_atlas/nvmath/Basis.h
new file mode 100644
index 0000000000..e8146afdbe
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Basis.h
@@ -0,0 +1,82 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_BASIS_H
+#define NV_MATH_BASIS_H
+
+#include "nvmath.h"
+#include "Vector.inl"
+#include "Matrix.h"
+
+namespace nv
+{
+
+    /// Basis class to compute tangent space basis, orthogonalizations and to
+    /// transform vectors from one space to another.
+    class Basis
+    {
+    public:
+
+        /// Create a null basis (all three vectors zero).
+        Basis() : tangent(0, 0, 0), bitangent(0, 0, 0), normal(0, 0, 0) {}
+
+        /// Create a basis given three vectors. Note the argument order: normal, tangent, bitangent.
+        Basis(Vector3::Arg n, Vector3::Arg t, Vector3::Arg b) : tangent(t), bitangent(b), normal(n) {}
+
+        /// Create a basis with the given normal and tangent vectors and the handedness sign.
+        Basis(Vector3::Arg n, Vector3::Arg t, float sign)
+        {
+            build(n, t, sign);
+        }
+
+        NVMATH_API void normalize(float epsilon = NV_EPSILON);
+        NVMATH_API void orthonormalize(float epsilon = NV_EPSILON);
+        NVMATH_API void robustOrthonormalize(float epsilon = NV_EPSILON);
+        NVMATH_API void buildFrameForDirection(Vector3::Arg d, float angle = 0);
+
+        /// Handedness of the basis from the sign of its determinant: +1 when positive, -1 otherwise (including zero).
+        float handness() const
+        {
+            return determinant() > 0.0f ? 1.0f : -1.0f;
+        }
+
+        /// Build a basis from 2 vectors and a handedness flag; the bitangent is sign * cross(t, n).
+        void build(Vector3::Arg n, Vector3::Arg t, float sign)
+        {
+            normal = n;
+            tangent = t;
+            bitangent = sign * cross(t, n);
+        }
+
+        /// Compute the determinant of this basis, i.e. of the 3x3 matrix formed by tangent, bitangent and normal.
+        float determinant() const
+        {
+            return 
+                tangent.x * bitangent.y * normal.z - tangent.z * bitangent.y * normal.x +
+                tangent.y * bitangent.z * normal.x - tangent.y * bitangent.x * normal.z + 
+                tangent.z * bitangent.x * normal.y - tangent.x * bitangent.z * normal.y;
+        }
+
+        bool isValid() const;
+
+        // Get transform matrix for this basis.
+        NVMATH_API Matrix matrix() const;
+
+        // Transform by this basis. (From this basis to object space).
+        NVMATH_API Vector3 transform(Vector3::Arg v) const;
+
+        // Transform by the transpose. (From object space to this basis). Not declared const, unlike its siblings.
+        NVMATH_API Vector3 transformT(Vector3::Arg v);
+
+        // Transform by the inverse. (From object space to this basis).
+        NVMATH_API Vector3 transformI(Vector3::Arg v) const;
+
+
+        Vector3 tangent;
+        Vector3 bitangent;
+        Vector3 normal;
+    };
+
+} // nv namespace
+
+#endif // NV_MATH_BASIS_H
diff --git a/thirdparty/thekla_atlas/nvmath/Box.cpp b/thirdparty/thekla_atlas/nvmath/Box.cpp
new file mode 100644
index 0000000000..8f2014a077
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Box.cpp
@@ -0,0 +1,119 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Box.h"
+#include "Box.inl"
+#include "Sphere.h"
+
+using namespace nv;
+
+
+
+
+// Clip the segment origin + t * dir, t in [*t_near, *t_far], against this box using the slab method.
+// Returns false (outputs untouched) if nothing is left, else true with *t_near / *t_far narrowed to the part inside.
+bool Box::clipSegment(const Vector3 & origin, const Vector3 & dir, float * t_near, float * t_far) const {
+
+	// Avoid aliasing.
+	float tnear = *t_near;
+	float tfar = *t_far;
+
+	// Intersect the parameter interval with the interval spanned inside each axis-aligned slab.
+	for (int i = 0; i < 3; i++)
+	{
+		const float o = origin.component[i];
+		const float d = dir.component[i];
+		const float lo = minCorner.component[i];
+		const float hi = maxCorner.component[i];
+
+		if (d == 0.0f) {
+			// Parallel to this slab: the whole segment is either inside it or outside it.
+			if (o < lo || o > hi) {
+				return false;
+			}
+		}
+		else {
+			// Parameters at which the supporting line crosses the two planes of the slab.
+			const float tlo = (lo - o) / d;
+			const float thi = (hi - o) / d;
+
+			// When moving towards the negative axis direction the max plane is crossed first.
+			const float tenter = (d < 0) ? thi : tlo;
+			const float texit = (d < 0) ? tlo : thi;
+
+			// clip start point
+			if (tenter > tnear) tnear = tenter;
+
+			// clip end point
+			if (texit < tfar) tfar = texit;
+		}
+
+		if (tnear > tfar) {
+			// Clipped away.
+			return false;
+		}
+	}
+
+	// Return result.
+	*t_near = tnear;
+	*t_far = tfar;
+	return true;
+}
+
+
+float nv::distanceSquared(const Box &box, const Vector3 &point) {
+    // Squared distance from point to box: clamp the point into the box per axis,
+    // then measure from the point to that clamped position.
+    const Vector3 & lo = box.minCorner;
+    const Vector3 & hi = box.maxCorner;
+
+    Vector3 closest;
+
+    closest.x = (point.x < lo.x) ? lo.x : ((point.x > hi.x) ? hi.x : point.x);
+
+    closest.y = (point.y < lo.y) ? lo.y : ((point.y > hi.y) ? hi.y : point.y);
+
+    closest.z = (point.z < lo.z) ? lo.z : ((point.z > hi.z) ? hi.z : point.z);
+
+    // The difference is the zero vector whenever the point lies inside the box or on its surface.
+    return lengthSquared(point - closest);
+}
+
+bool nv::overlap(const Box &box, const Sphere &sphere) { // True when the box point closest to the sphere center lies strictly inside the sphere.
+    return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius; // Strict '<': exact tangency does not count as overlap.
+}
+
+
+bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
+    // Ray/box slab test. p is the ray origin, id the per-component INVERSE ray direction. Precompute these in ray structure?
+    int sdx = (id.x < 0);
+    int sdy = (id.y < 0);
+    int sdz = (id.z < 0);
+    // sd* is 1 for a negative direction, so corner(sd*) is the near plane and corner(1-sd*) the far plane on that axis.
+    float tmin = (box.corner(  sdx).x - p.x) * id.x;
+    float tmax = (box.corner(1-sdx).x - p.x) * id.x;
+    float tymin = (box.corner(  sdy).y - p.y) * id.y;
+    float tymax = (box.corner(1-sdy).y - p.y) * id.y;
+
+    if ((tmin > tymax) || (tymin > tmax)) 
+        return false;
+
+    if (tymin > tmin) tmin = tymin;
+    if (tymax < tmax) tmax = tymax;
+
+    float tzmin = (box.corner(  sdz).z - p.z) * id.z;
+    float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
+
+    if ((tmin > tzmax) || (tzmin > tmax)) 
+        return false;
+
+    if (tzmin > tmin) tmin = tzmin;
+    if (tzmax < tmax) tmax = tzmax;
+
+    if (tmax < 0) 
+        return false;
+    // The box is hit at or in front of the origin; tmin is the entry parameter and can be negative when p is inside the box.
+    if (t != NULL) *t = tmin;
+
+    return true;
+}
+
diff --git a/thirdparty/thekla_atlas/nvmath/Box.h b/thirdparty/thekla_atlas/nvmath/Box.h
new file mode 100644
index 0000000000..19b5f2a3a5
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Box.h
@@ -0,0 +1,103 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_BOX_H
+#define NV_MATH_BOX_H
+
+#include "Vector.h"
+
+#include <float.h> // FLT_MAX
+
+namespace nv
+{
+    class Vector;
+    class Stream;
+    class Sphere;
+
+    // Axis Aligned Bounding Box, stored as its minimum and maximum corners.
+    class Box
+    {
+    public:
+
+        inline Box() {}
+        inline Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) {}
+        inline Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) {}
+
+        Box & operator=(const Box & b);
+        // View the box as raw floats starting at the object's address (assumes the two Vector3 members are tightly packed floats -- TODO confirm).
+        operator const float * () const { return reinterpret_cast<const float *>(this); }
+
+        // Clear the bounds.
+        void clearBounds();
+
+        // min <= max on every axis
+        bool isValid() const;
+
+        // Build a cube centered on center and with edge = 2*dist
+        void cube(const Vector3 & center, float dist);
+
+        // Build a box, given center and extents (half-sizes).
+        void setCenterExtents(const Vector3 & center, const Vector3 & extents);
+
+        // Get box center.
+        Vector3 center() const;
+
+        // Return extents (half-sizes) of the box.
+        Vector3 extents() const;
+
+        // Return the extent (half-size) of the box along one axis: 0 = x, 1 = y, 2 = z.
+        float extents(uint axis) const;
+
+        // Add a point to this box.
+        void addPointToBounds(const Vector3 & p);
+
+        // Add a box to this box.
+        void addBoxToBounds(const Box & b);
+
+        // Add sphere to this box.
+        void addSphereToBounds(const Vector3 & p, float r);
+
+        // Translate box.
+        void translate(const Vector3 & v);
+
+        // Scale the box.
+        void scale(float s);
+
+        // Expand the box by a fixed amount.
+        void expand(float r);
+
+        // Get the area of the box.
+        float area() const;
+ 
+        // Get the volume of the box.
+        float volume() const;
+
+        // Return true if the box contains the given point.
+        bool contains(const Vector3 & p) const;
+
+        // Split the given box in 8 octants and assign the ith one to this box.
+        void setOctant(const Box & box, const Vector3 & center, int i);
+
+
+        // Clip the given segment against this box.
+        bool clipSegment(const Vector3 & origin, const Vector3 & dir, float * t_near, float * t_far) const;
+
+
+        friend Stream & operator<< (Stream & s, Box & box);
+        // corner(0) is minCorner and corner(1) is maxCorner; relies on the two members being adjacent in memory.
+        const Vector3 & corner(int i) const { return (&minCorner)[i]; }
+
+        Vector3 minCorner;
+        Vector3 maxCorner;
+    };
+
+    float distanceSquared(const Box &box, const Vector3 &point);
+    bool overlap(const Box &box, const Sphere &sphere);
+
+    // p is ray origin, id is inverse ray direction.
+    bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
+
+} // nv namespace
+
+
+#endif // NV_MATH_BOX_H
diff --git a/thirdparty/thekla_atlas/nvmath/Box.inl b/thirdparty/thekla_atlas/nvmath/Box.inl
new file mode 100644
index 0000000000..dcfa70ff96
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Box.inl
@@ -0,0 +1,154 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_BOX_INL
+#define NV_MATH_BOX_INL
+
+#include "Box.h"
+#include "Vector.inl"
+
+#include <float.h> // FLT_MAX
+
+namespace nv
+{
+    // Default ctor.
+    //inline Box::Box() { };
+
+    // Copy ctor.
+    //inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
+
+    // Init ctor.
+    //inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
+
+    // Assignment operator: copies both corners and returns *this.
+    inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
+
+    // Clear the bounds: min becomes +FLT_MAX and max becomes -FLT_MAX, an inverted box for which isValid() is false.
+    inline void Box::clearBounds()
+    {
+        minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
+        maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+    }
+
+    // min <= max on every axis (a box with zero size along an axis still counts as valid).
+    inline bool Box::isValid() const
+    {
+        return minCorner.x <= maxCorner.x && minCorner.y <= maxCorner.y && minCorner.z <= maxCorner.z;
+    }
+
+    // Build a cube centered on center and with edge = 2*dist (dist is used as the extent on every axis).
+    inline void Box::cube(const Vector3 & center, float dist)
+    {
+        setCenterExtents(center, Vector3(dist));
+    }
+
+    // Build a box, given center and extents: the corners are center -/+ extents, so extents are half-sizes.
+    inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
+    {
+        minCorner = center - extents;
+        maxCorner = center + extents;
+    }
+
+    // Get box center: the midpoint of the two corners.
+    inline Vector3 Box::center() const
+    {
+        return (minCorner + maxCorner) * 0.5f;
+    }
+
+    // Return extents of the box: half of the corner-to-corner size on each axis.
+    inline Vector3 Box::extents() const
+    {
+        return (maxCorner - minCorner) * 0.5f;
+    }
+
+    // Return the extent (half-size) of the box along one axis: 0 = x, 1 = y, 2 = z.
+    inline float Box::extents(uint axis) const
+    {
+        nvDebugCheck(axis < 3);
+        if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
+        if (axis == 1) return (maxCorner.y - minCorner.y) * 0.5f;
+        if (axis == 2) return (maxCorner.z - minCorner.z) * 0.5f;
+        nvUnreachable(); // Only reached for axis >= 3, which the check above already rejects.
+        return 0.0f;
+    }
+
+    // Add a point to this box: grow the bounds, component-wise, just enough to include p.
+    inline void Box::addPointToBounds(const Vector3 & p)
+    {
+        minCorner = min(minCorner, p);
+        maxCorner = max(maxCorner, p);
+    }
+
+    // Add a box to this box: the result is the component-wise union of both bounds.
+    inline void Box::addBoxToBounds(const Box & b)
+    {
+        minCorner = min(minCorner, b.minCorner);
+        maxCorner = max(maxCorner, b.maxCorner);
+    }
+
+    // Add sphere to this box: grow the bounds to enclose the sphere of radius r centered at p.
+    inline void Box::addSphereToBounds(const Vector3 & p, float r) {
+        minCorner = min(minCorner, p - Vector3(r));
+        maxCorner = max(maxCorner, p + Vector3(r)); // max, not min: the upper corner may only move outwards.
+    }
+
+    // Translate box: offsets both corners by v, leaving the size unchanged.
+    inline void Box::translate(const Vector3 & v)
+    {
+        minCorner += v;
+        maxCorner += v;
+    }
+
+    // Scale the box: multiplies both corners by s, i.e. scales about the origin rather than about the box center.
+    inline void Box::scale(float s)
+    {
+        minCorner *= s;
+        maxCorner *= s;
+    }
+
+    // Expand the box by a fixed amount: every face moves outwards by r (a negative r shrinks the box).
+    inline void Box::expand(float r) {
+        minCorner -= Vector3(r,r,r);
+        maxCorner += Vector3(r,r,r);
+    }
+
+    // Get the surface area of the box. extents() are half-sizes, hence the factor 8 = 2 faces * (2 * 2) per axis pair.
+    inline float Box::area() const
+    {
+        const Vector3 d = extents();
+        return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
+    }	
+
+    // Get the volume of the box. extents() are half-sizes, hence the factor 8 = 2 * 2 * 2.
+    inline float Box::volume() const
+    {
+        Vector3 d = extents();
+        return 8.0f * (d.x * d.y * d.z);
+    }
+
+    // Return true if the box strictly contains the given point; points exactly on a face, edge or corner are reported as outside.
+    inline bool Box::contains(const Vector3 & p) const
+    {
+        return 
+            minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
+            maxCorner.x > p.x && maxCorner.y > p.y && maxCorner.z > p.z;
+    }
+
+    // Split the given box in 8 octants around center and assign the ith one to this box.
+    inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
+    {
+        minCorner = box.minCorner;
+        maxCorner = box.maxCorner;
+
+        if (i & 4) minCorner.x = center.x; // Bit 2 set: upper half in x, otherwise lower half.
+        else       maxCorner.x = center.x;
+        if (i & 2) minCorner.y = center.y; // Bit 1 set: upper half in y, otherwise lower half.
+        else       maxCorner.y = center.y;
+        if (i & 1) minCorner.z = center.z; // Bit 0 set: upper half in z, otherwise lower half.
+        else       maxCorner.z = center.z;
+    }
+
+} // nv namespace
+
+
+#endif // NV_MATH_BOX_INL
diff --git a/thirdparty/thekla_atlas/nvmath/Color.h b/thirdparty/thekla_atlas/nvmath/Color.h
new file mode 100644
index 0000000000..5cdc374bd9
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Color.h
@@ -0,0 +1,150 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_COLOR_H
+#define NV_MATH_COLOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+
+    /// 64 bit color stored as BGRA: four 16 bit channels overlaid on one uint64.
+    class NVMATH_CLASS Color64 
+    {
+    public:
+        Color64() { }
+        Color64(const Color64 & c) : u(c.u) { }
+        Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
+        explicit Color64(uint64 U) : u(U) { }
+        // Assign the four channels by name; their position inside 'u' depends on the endian-specific layout below.
+        void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }
+
+        operator uint64 () const {
+            return u;
+        }
+        // NOTE(review): the little-endian member order is r, a, b, g, which does not mirror Color32's b, g, r, a -- confirm this is the intended layout.
+        union {
+            struct {
+#if NV_LITTLE_ENDIAN
+                uint16 r, a, b, g;
+#else
+                uint16 a: 16;
+                uint16 r: 16;
+                uint16 g: 16;
+                uint16 b: 16;
+#endif
+            };
+            uint64 u;
+        };
+    };
+
+    /// 32 bit color stored as BGRA: four 8 bit channels overlaid on one uint32 and on a 4-byte array.
+    class NVMATH_CLASS Color32
+    {
+    public:
+        Color32() { }
+        Color32(const Color32 & c) : u(c.u) { }
+        Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); } // Alpha defaults to 0xFF.
+        Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); }
+        //Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
+        //Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
+        //Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
+        explicit Color32(uint32 U) : u(U) { }
+        // Assign the four channels, arguments given in R, G, B, A order.
+        void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }
+        // Same assignment as setRGBA, but with the arguments given in B, G, R, A order; alpha defaults to 0xFF.
+        void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }
+
+        operator uint32 () const {
+            return u;
+        }
+        // Named channels, raw bytes (component[]) and the packed value (u) all alias the same 4 bytes.
+        union {
+            struct {
+#if NV_LITTLE_ENDIAN
+                uint8 b, g, r, a;
+#else
+                uint8 a: 8;
+                uint8 r: 8;
+                uint8 g: 8;
+                uint8 b: 8;
+#endif
+            };
+            uint8 component[4];
+            uint32 u;
+        };
+    };
+
+
+    /// 16 bit 565 BGR color: 5 bits blue, 6 bits green, 5 bits red, overlaid on one uint16.
+    class NVMATH_CLASS Color16
+    {
+    public:
+        Color16() { }
+        Color16(const Color16 & c) : u(c.u) { }
+        explicit Color16(uint16 U) : u(U) { }
+        // The bitfield declaration order is reversed between the two endian variants.
+        union {
+            struct {
+#if NV_LITTLE_ENDIAN
+                uint16 b : 5;
+                uint16 g : 6;
+                uint16 r : 5;
+#else
+                uint16 r : 5;
+                uint16 g : 6;
+                uint16 b : 5;
+#endif
+            };
+            uint16 u;
+        };
+    };
+
+    /// 16 bit 4444 BGRA color: four 4 bit channels overlaid on one uint16.
+    class NVMATH_CLASS Color16_4444
+    {
+    public:
+        Color16_4444() { }
+        Color16_4444(const Color16_4444 & c) : u(c.u) { }
+        explicit Color16_4444(uint16 U) : u(U) { }
+        // The bitfield declaration order is reversed between the two endian variants.
+        union {
+            struct {
+#if NV_LITTLE_ENDIAN
+                uint16 b : 4;
+                uint16 g : 4;
+                uint16 r : 4;
+                uint16 a : 4;
+#else
+                uint16 a : 4;
+                uint16 r : 4;
+                uint16 g : 4;
+                uint16 b : 4;
+#endif
+            };
+            uint16 u;
+        };
+    };
+
+} // nv namespace
+
+#endif // NV_MATH_COLOR_H
diff --git a/thirdparty/thekla_atlas/nvmath/ConvexHull.cpp b/thirdparty/thekla_atlas/nvmath/ConvexHull.cpp
new file mode 100644
index 0000000000..a4a95dace4
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/ConvexHull.cpp
@@ -0,0 +1,120 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "ConvexHull.h"
+
+#include "Vector.inl"
+
+#include "nvcore/RadixSort.h"
+#include "nvcore/Array.inl"
+
+using namespace nv;
+
+inline static float triangleArea(Vector2::Arg v1, Vector2::Arg v2, Vector2::Arg v3) // Signed area of triangle (v1, v2, v3).
+{
+    return 0.5f * (v3.x * v1.y + v1.x * v2.y + v2.x * v3.y - v2.x * v1.y - v3.x * v2.y - v1.x * v3.y); // Half of cross(v2 - v1, v3 - v1), expanded.
+}
+
+
+// Compute the convex hull using Graham Scan over the points sorted by x. Inputs with fewer than two points are
+// copied to output unchanged; a hull point is discarded when the turn through it has signed area >= -epsilon.
+void nv::convexHull(const Array<Vector2> & input, Array<Vector2> & output, float epsilon/*=0*/)
+{
+    const uint inputCount = input.count();
+    // Degenerate inputs would index past the end of the arrays below. (Copy before clear(): output may alias input.)
+    if (inputCount == 0) { output.clear(); return; }
+    if (inputCount == 1) { const Vector2 only = input[0]; output.clear(); output.append(only); return; }
+
+    Array<float> coords;
+    coords.resize(inputCount);
+
+    for (uint i = 0; i < inputCount; i++) {
+        coords[i] = input[i].x;
+    }
+
+    RadixSort radix;
+    radix.sort(coords);
+    const uint * ranks = radix.ranks();
+    Array<Vector2> top(inputCount);
+    Array<Vector2> bottom(inputCount);
+
+    Vector2 P = input[ranks[0]];
+    Vector2 Q = input[ranks[inputCount-1]];
+
+    float topy = max(P.y, Q.y);
+    float boty = min(P.y, Q.y);
+
+    for (uint i = 0; i < inputCount; i++) {
+        Vector2 p = input[ranks[i]];
+        if (p.y >= boty) top.append(p);
+    }
+
+    for (uint i = 0; i < inputCount; i++) {
+        Vector2 p = input[ranks[inputCount-1-i]];
+        if (p.y <= topy) bottom.append(p);
+    }
+
+    // Filter top list.
+    output.clear();
+    output.append(top[0]);
+    output.append(top[1]);
+
+    for (uint i = 2; i < top.count(); ) {
+        Vector2 a = output[output.count()-2];
+        Vector2 b = output[output.count()-1];
+        Vector2 c = top[i];
+        float area = triangleArea(a, b, c);
+
+        if (area >= -epsilon) {
+            output.popBack();
+        }
+
+        if (area < -epsilon || output.count() == 1) {
+            output.append(c);
+            i++;
+        }
+    }
+    
+    uint top_count = output.count();
+    output.append(bottom[1]);
+
+    // Filter bottom list.
+    for (uint i = 2; i < bottom.count(); ) {
+        Vector2 a = output[output.count()-2];
+        Vector2 b = output[output.count()-1];
+        Vector2 c = bottom[i];
+        float area = triangleArea(a, b, c);
+
+        if (area >= -epsilon) {
+            output.popBack();
+        }
+
+        if (area < -epsilon || output.count() == top_count) {
+            output.append(c);
+            i++;
+        }
+    }
+
+    // Remove duplicate element.
+    nvDebugCheck(output.front() == output.back());
+    output.popBack();
+}
+
+/*
+void testConvexHull() {
+
+    Array<Vector2> points;
+    points.append(Vector2(1.00, 1.00));
+    points.append(Vector2(0.00, 0.00));
+    points.append(Vector2(1.00, 1.00));
+    points.append(Vector2(1.00, -1.00));
+    points.append(Vector2(2.00, 5.00));
+    points.append(Vector2(-5.00, 3.00));
+    points.append(Vector2(-4.00, -3.00));
+    points.append(Vector2(7.00, -4.00));
+
+    Array<Vector2> hull;
+    convexHull(points, hull);
+
+}
+*/
+
diff --git a/thirdparty/thekla_atlas/nvmath/ConvexHull.h b/thirdparty/thekla_atlas/nvmath/ConvexHull.h
new file mode 100644
index 0000000000..6c2db5d73f
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/ConvexHull.h
@@ -0,0 +1,17 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_CONVEXHULL_H
+#define NV_MATH_CONVEXHULL_H
+
+#include "nvmath.h"
+#include "nvcore/Array.h"
+
+namespace nv {
+    // Forward declaration: this header only names Vector2 inside Array<Vector2> references.
+    class Vector2;
+ 
+    // Computes the convex hull of the 2D points in `input`.
+    //   input   - points to enclose.
+    //   output  - cleared and then filled with the hull vertices.
+    //   epsilon - tolerance compared against signed triangle areas while filtering
+    //             hull candidates; defaults to 0.
+    void convexHull(const Array<Vector2> & input, Array<Vector2> & output, float epsilon = 0);
+
+} // namespace nv
+
+#endif // NV_MATH_CONVEXHULL_H
diff --git a/thirdparty/thekla_atlas/nvmath/Fitting.cpp b/thirdparty/thekla_atlas/nvmath/Fitting.cpp
new file mode 100644
index 0000000000..6cd5cb0f32
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Fitting.cpp
@@ -0,0 +1,1205 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Fitting.h"
+#include "Vector.inl"
+#include "Plane.inl"
+
+#include "nvcore/Array.inl"
+#include "nvcore/Utils.h" // max, swap
+
+#include <float.h> // FLT_MAX
+//#include <vector>
+#include <string.h>
+
+using namespace nv;
+
+// @@ Move to EigenSolver.h
+
+// @@ We should be able to do something cheaper...
+// Picks a starting guess for the dominant eigenvector of a symmetric 3x3 matrix.
+// `matrix` holds the six unique entries in the order [xx, xy, xz, yy, yz, zz].
+// Returns the matrix row with the largest squared length; when no row is
+// strictly the largest the later rows win, exactly as the comparisons dictate.
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
+{
+	// Rebuild the three rows of the full symmetric matrix.
+	const Vector3 rowX(matrix[0], matrix[1], matrix[2]);
+	const Vector3 rowY(matrix[1], matrix[3], matrix[4]);
+	const Vector3 rowZ(matrix[2], matrix[4], matrix[5]);
+
+	const float lenX = lengthSquared(rowX);
+	const float lenY = lengthSquared(rowY);
+	const float lenZ = lengthSquared(rowZ);
+
+	if (lenX > lenY && lenX > lenZ)
+	{
+		return rowX;
+	}
+	else if (lenY > lenZ)
+	{
+		return rowY;
+	}
+	else
+	{
+		return rowZ;
+	}
+}
+
+
+// Approximates the dominant eigenvector of a symmetric 3x3 matrix (packed as
+// [xx, xy, xz, yy, yz, zz]) with a fixed number of power iterations.
+// Returns the zero vector when all three diagonal entries are zero.
+static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
+{
+    const bool zeroDiagonal = (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0);
+    if (zeroDiagonal)
+    {
+        return Vector3(0.0f);
+    }
+
+    Vector3 guess = estimatePrincipalComponent(matrix);
+
+    const int iterationCount = 8;
+    int step = 0;
+    while (step < iterationCount)
+    {
+        // Multiply the current guess by the symmetric matrix.
+        float mx = guess.x * matrix[0] + guess.y * matrix[1] + guess.z * matrix[2];
+        float my = guess.x * matrix[1] + guess.y * matrix[3] + guess.z * matrix[4];
+        float mz = guess.x * matrix[2] + guess.y * matrix[4] + guess.z * matrix[5];
+
+        // Rescale by the largest (signed) component to keep the values bounded.
+        float scale = max(max(mx, my), mz);
+        guess = Vector3(mx, my, mz) / scale;
+
+        ++step;
+    }
+
+    return guess;
+}
+
+
+// Returns the arithmetic mean of the first `n` entries of `points`.
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
+{
+    Vector3 mean(0.0f);
+
+    int index = 0;
+    while (index < n)
+    {
+        mean += points[index];
+        ++index;
+    }
+
+    // Turn the accumulated sum into an average.
+    mean /= float(n);
+    return mean;
+}
+
+// Returns the weighted mean of the first `n` entries of `points`, where
+// `weights[i]` is the weight of `points[i]`. `metric` is accepted for signature
+// symmetry with the other fitting routines but is not used here.
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float weightSum = 0.0f;
+    Vector3 mean(0.0f);
+
+    int index = 0;
+    while (index < n)
+    {
+        weightSum += weights[index];
+        mean += weights[index]*points[index];
+        ++index;
+    }
+
+    // Normalize by the total weight.
+    mean /= weightSum;
+    return mean;
+}
+
+// Returns the arithmetic mean of the first `n` entries of `points`.
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points)
+{
+    Vector4 mean(0.0f);
+
+    int index = 0;
+    while (index < n)
+    {
+        mean += points[index];
+        ++index;
+    }
+
+    // Turn the accumulated sum into an average.
+    mean /= float(n);
+    return mean;
+}
+
+// Returns the weighted mean of the first `n` entries of `points`, where
+// `weights[i]` is the weight of `points[i]`. `metric` is accepted for signature
+// symmetry with the other fitting routines but is not used here.
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float weightSum = 0.0f;
+    Vector4 mean(0.0f);
+
+    int index = 0;
+    while (index < n)
+    {
+        weightSum += weights[index];
+        mean += weights[index]*points[index];
+        ++index;
+    }
+
+    // Normalize by the total weight.
+    mean /= weightSum;
+    return mean;
+}
+
+
+
+// Accumulates the (unnormalized) covariance of `n` points around their centroid.
+//   covariance - output, 6 floats: [xx, xy, xz, yy, yz, zz].
+// Returns the centroid that was used.
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
+{
+    Vector3 center = computeCentroid(n, points);
+
+    // Start from an all-zero accumulator.
+    for (int entry = 0; entry < 6; ++entry)
+    {
+        covariance[entry] = 0.0f;
+    }
+
+    // Sum the outer products of the centered points (upper triangle only).
+    for (int k = 0; k < n; ++k)
+    {
+        Vector3 d = points[k] - center;
+
+        covariance[0] += d.x * d.x;
+        covariance[1] += d.x * d.y;
+        covariance[2] += d.x * d.z;
+        covariance[3] += d.y * d.y;
+        covariance[4] += d.y * d.z;
+        covariance[5] += d.z * d.z;
+    }
+
+    return center;
+}
+
+// Accumulates the weighted (unnormalized) covariance of `n` points around their
+// weighted centroid. Each centered point is scaled component-wise by `metric`
+// before it contributes.
+//   covariance - output, 6 floats: [xx, xy, xz, yy, yz, zz].
+// Returns the weighted centroid that was used.
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
+{
+    Vector3 center = computeCentroid(n, points, weights, metric);
+
+    // Start from an all-zero accumulator.
+    for (int entry = 0; entry < 6; ++entry)
+    {
+        covariance[entry] = 0.0f;
+    }
+
+    for (int k = 0; k < n; ++k)
+    {
+        // `scaled` is the metric-scaled offset; `weighted` carries the point weight.
+        Vector3 scaled = (points[k] - center) * metric;
+        Vector3 weighted = weights[k]*scaled;
+
+        covariance[0] += scaled.x * weighted.x;
+        covariance[1] += scaled.x * weighted.y;
+        covariance[2] += scaled.x * weighted.z;
+        covariance[3] += scaled.y * weighted.y;
+        covariance[4] += scaled.y * weighted.z;
+        covariance[5] += scaled.z * weighted.z;
+    }
+
+    return center;
+}
+
+// Accumulates the (unnormalized) covariance of `n` 4D points around their centroid.
+//   covariance - output, 10 floats holding the upper triangle row by row:
+//                [xx, xy, xz, xw, yy, yz, yw, zz, zw, ww].
+// Returns the centroid that was used.
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance)
+{
+    Vector4 center = computeCentroid(n, points);
+
+    // Start from an all-zero accumulator.
+    for (int entry = 0; entry < 10; ++entry)
+    {
+        covariance[entry] = 0.0f;
+    }
+
+    for (int k = 0; k < n; ++k)
+    {
+        Vector4 d = points[k] - center;
+
+        // Row x
+        covariance[0] += d.x * d.x;
+        covariance[1] += d.x * d.y;
+        covariance[2] += d.x * d.z;
+        covariance[3] += d.x * d.w;
+        // Row y
+        covariance[4] += d.y * d.y;
+        covariance[5] += d.y * d.z;
+        covariance[6] += d.y * d.w;
+        // Row z
+        covariance[7] += d.z * d.z;
+        covariance[8] += d.z * d.w;
+        // Row w
+        covariance[9] += d.w * d.w;
+    }
+
+    return center;
+}
+
+// Accumulates the weighted (unnormalized) covariance of `n` 4D points around
+// their weighted centroid. Each centered point is scaled component-wise by
+// `metric` before it contributes.
+//   covariance - output, 10 floats holding the upper triangle row by row:
+//                [xx, xy, xz, xw, yy, yz, yw, zz, zw, ww].
+// Returns the weighted centroid that was used.
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance)
+{
+    Vector4 center = computeCentroid(n, points, weights, metric);
+
+    // Start from an all-zero accumulator.
+    for (int entry = 0; entry < 10; ++entry)
+    {
+        covariance[entry] = 0.0f;
+    }
+
+    for (int k = 0; k < n; ++k)
+    {
+        // `scaled` is the metric-scaled offset; `weighted` carries the point weight.
+        Vector4 scaled = (points[k] - center) * metric;
+        Vector4 weighted = weights[k]*scaled;
+
+        // Row x
+        covariance[0] += scaled.x * weighted.x;
+        covariance[1] += scaled.x * weighted.y;
+        covariance[2] += scaled.x * weighted.z;
+        covariance[3] += scaled.x * weighted.w;
+        // Row y
+        covariance[4] += scaled.y * weighted.y;
+        covariance[5] += scaled.y * weighted.z;
+        covariance[6] += scaled.y * weighted.w;
+        // Row z
+        covariance[7] += scaled.z * weighted.z;
+        covariance[8] += scaled.z * weighted.w;
+        // Row w
+        covariance[9] += scaled.w * weighted.w;
+    }
+
+    return center;
+}
+
+
+
+// Estimates the principal axis of `n` points: builds their covariance matrix
+// and runs the power method on it.
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points)
+{
+    float covariance[6];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, covariance);
+
+    Vector3 axis = firstEigenVector_PowerMethod(covariance);
+    return axis;
+}
+
+// Estimates the principal axis of `n` weighted points: builds their weighted,
+// metric-scaled covariance matrix and runs the power method on it.
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float covariance[6];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, weights, metric, covariance);
+
+    Vector3 axis = firstEigenVector_PowerMethod(covariance);
+    return axis;
+}
+
+
+
+// Returns the eigenvector with the largest eigenvalue of a symmetric 3x3 matrix
+// (packed as [xx, xy, xz, yy, yz, zz]) using the tridiagonal eigen solver.
+// Returns the zero vector when the diagonal is all zero or the solver fails.
+static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix)
+{
+    const bool hasDiagonal = (matrix[0] != 0 || matrix[3] != 0 || matrix[5] != 0);
+    if (hasDiagonal)
+    {
+        float values[3];
+        Vector3 vectors[3];
+        if (nv::Fit::eigenSolveSymmetric3(matrix, values, vectors))
+        {
+            // The solver sorts its results in descending eigenvalue order.
+            return vectors[0];
+        }
+    }
+
+    return Vector3(0.0f);
+}
+
+// Principal axis of `n` points: covariance matrix followed by the symmetric
+// 3x3 eigen solver.
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points)
+{
+    float covariance[6];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, covariance);
+
+    Vector3 axis = firstEigenVector_EigenSolver3(covariance);
+    return axis;
+}
+
+// Principal axis of `n` weighted points: weighted, metric-scaled covariance
+// matrix followed by the symmetric 3x3 eigen solver.
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float covariance[6];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, weights, metric, covariance);
+
+    Vector3 axis = firstEigenVector_EigenSolver3(covariance);
+    return axis;
+}
+
+
+
+// Returns the eigenvector with the largest eigenvalue of a symmetric 4x4 matrix
+// (packed upper triangle, 10 floats; diagonal entries at indices 0, 4, 7, 9).
+// Returns the zero vector when the diagonal is all zero or the solver fails.
+static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix)
+{
+    const bool hasDiagonal = (matrix[0] != 0 || matrix[4] != 0 || matrix[7] != 0 || matrix[9] != 0);
+    if (hasDiagonal)
+    {
+        float values[4];
+        Vector4 vectors[4];
+        if (nv::Fit::eigenSolveSymmetric4(matrix, values, vectors))
+        {
+            // The solver sorts its results in descending eigenvalue order.
+            return vectors[0];
+        }
+    }
+
+    return Vector4(0.0f);
+}
+
+// Principal axis of `n` 4D points: covariance matrix followed by the symmetric
+// 4x4 eigen solver.
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points)
+{
+    float covariance[10];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, covariance);
+
+    Vector4 axis = firstEigenVector_EigenSolver4(covariance);
+    return axis;
+}
+
+// Principal axis of `n` weighted 4D points: weighted, metric-scaled covariance
+// matrix followed by the symmetric 4x4 eigen solver.
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float covariance[10];
+
+    // Only the matrix is needed; the returned centroid is discarded.
+    computeCovariance(n, points, weights, metric, covariance);
+
+    Vector4 axis = firstEigenVector_EigenSolver4(covariance);
+    return axis;
+}
+
+
+
+// Forward declaration of the SVD routine defined later in this file.
+// Q is indexed as a rows x cols row-major matrix, diag has cols entries and
+// R is indexed as a cols x cols row-major matrix.
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
+
+// Estimates the principal axis of `n` points by running an SVD on a square
+// matrix whose rows are the points (remaining entries are zero).
+// Returns the first three entries of the first row of the right-hand factor R.
+//
+// The matrix side is max(n, 3): every row stores three coordinates, so a side
+// smaller than 3 would make the x/y/z writes overlap the next row or run past
+// the end of the buffer. For n >= 3 the layout is unchanged.
+Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
+{
+	const int dim = (n > 3) ? n : 3;
+
+	// Store the points in a dim x dim matrix, one point per row.
+    Array<float> Q; Q.resize(dim*dim, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*dim+0] = points[i].x;
+		Q[i*dim+1] = points[i].y;
+		Q[i*dim+2] = points[i].z;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(dim, 0.0f);
+    Array<float> R; R.resize(dim*dim, 0.0f);
+
+	ArvoSVD(dim, dim, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector3(R[0], R[1], R[2]);
+}
+
+// Estimates the principal axis of `n` 4D points by running an SVD on a square
+// matrix whose rows are the points (remaining entries are zero).
+// Returns the first four entries of the first row of the right-hand factor R.
+//
+// The matrix side is max(n, 4): every row stores four coordinates, so a side
+// smaller than 4 would make the x/y/z/w writes overlap the next row or run past
+// the end of the buffer. For n >= 4 the layout is unchanged.
+Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
+{
+	const int dim = (n > 4) ? n : 4;
+
+	// Store the points in a dim x dim matrix, one point per row.
+    Array<float> Q; Q.resize(dim*dim, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*dim+0] = points[i].x;
+		Q[i*dim+1] = points[i].y;
+		Q[i*dim+2] = points[i].z;
+		Q[i*dim+3] = points[i].w;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(dim, 0.0f);
+    Array<float> R; R.resize(dim*dim, 0.0f);
+
+	ArvoSVD(dim, dim, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector4(R[0], R[1], R[2], R[3]);
+}
+
+
+
+// Fits a plane through `n` points. The plane passes through the centroid and
+// uses the eigenvector with the smallest eigenvalue of the covariance matrix as
+// its normal. Falls back to a plane with normal (0, 0, 1) through the centroid
+// when the covariance diagonal is all zero or the eigen solver fails.
+Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
+{
+    float covariance[6];
+    Vector3 center = computeCovariance(n, points, covariance);
+
+    float values[3];
+    Vector3 axes[3];
+
+    // The solver is only invoked when the diagonal is not entirely zero.
+    const bool degenerate = (covariance[0] == 0 && covariance[3] == 0 && covariance[5] == 0);
+    if (degenerate || !eigenSolveSymmetric3(covariance, values, axes))
+    {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), center);
+    }
+
+    // Results are sorted in descending order, so index 2 is the smallest.
+    return Plane(axes[2], center);
+}
+
+// Reports whether `n` points lie (approximately) in a plane: true when the
+// smallest eigenvalue of their covariance matrix is below `epsilon`.
+// Returns false when the eigen solver does not converge.
+bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
+{
+    float covariance[6];
+    computeCovariance(n, points, covariance);
+
+    float values[3];
+    Vector3 axes[3];
+    const bool solved = eigenSolveSymmetric3(covariance, values, axes);
+
+    // values[] is sorted in descending order; index 2 is the smallest.
+    return solved && (values[2] < epsilon);
+}
+
+
+
+// Tridiagonal solver from Charles Bloom. 
+// Householder transforms followed by QL decomposition. 
+// Seems to be based on the code from Numerical Recipes in C.
+
+// Forward declarations of the two solver stages used by eigenSolveSymmetric3:
+// reduction to tridiagonal form, then QL iteration (returns false if it does
+// not converge within its iteration limit).
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd);
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd);
+
+// Computes the eigenvalues and eigenvectors of a symmetric 3x3 matrix.
+//   matrix       - packed upper triangle: [xx, xy, xz, yy, yz, zz].
+//   eigenValues  - output, sorted in descending order.
+//   eigenVectors - output, eigenVectors[i] belongs to eigenValues[i].
+// Returns false (with all outputs zeroed) when the QL iteration fails.
+bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    // Expand the packed upper triangle into a full symmetric matrix.
+    float full[3][3];
+    full[0][0] = matrix[0];
+    full[1][1] = matrix[3];
+    full[2][2] = matrix[5];
+    full[0][1] = full[1][0] = matrix[1];
+    full[0][2] = full[2][0] = matrix[2];
+    full[1][2] = full[2][1] = matrix[4];
+
+    float diagonal[3];
+    float offDiagonal[3];
+    EigenSolver3_Tridiagonal(full, diagonal, offDiagonal);
+
+    const bool converged = EigenSolver3_QLAlgorithm(full, diagonal, offDiagonal);
+    if (!converged)
+    {
+        for (int k = 0; k < 3; k++)
+        {
+            eigenValues[k] = 0;
+            eigenVectors[k] = Vector3(0);
+        }
+        return false;
+    }
+
+    // The eigenvectors are the columns of `full`; copy column c into
+    // eigenVectors[c] alongside its eigenvalue.
+    for (int col = 0; col < 3; col++)
+    {
+        eigenValues[col] = (float)diagonal[col];
+        for (int row = 0; row < 3; row++)
+        {
+            eigenVectors[col].component[row] = (float) full[row][col];
+        }
+    }
+
+    // Three conditional swaps put the eigenvalues in descending order.
+    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[0], eigenValues[2]);
+        swap(eigenVectors[0], eigenVectors[2]);
+    }
+
+    if (eigenValues[1] > eigenValues[0])
+    {
+        swap(eigenValues[0], eigenValues[1]);
+        swap(eigenVectors[0], eigenVectors[1]);
+    }
+
+    if (eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[1], eigenValues[2]);
+        swap(eigenVectors[1], eigenVectors[2]);
+    }
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
+
+    return true;
+}
+
+// Householder reduction T = Q^t M Q of a symmetric 3x3 matrix.
+//   mat  - in: symmetric matrix M; out: orthogonal matrix Q.
+//   diag - out: the three diagonal entries of T.
+//   subd - out: subdiagonal entries of T in subd[0..1]; subd[2] is set to 0.
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd)
+{
+    const float epsilon = 1e-08f;
+
+    // Only the upper triangle of the input is read.
+    const float m00 = mat[0][0];
+    float m01 = mat[0][1];
+    float m02 = mat[0][2];
+    const float m11 = mat[1][1];
+    const float m12 = mat[1][2];
+    const float m22 = mat[2][2];
+
+    diag[0] = m00;
+    subd[2] = 0.f;
+
+    // Written as a negated >= so that a NaN entry takes this branch, too.
+    if (!(fabsf(m02) >= epsilon))
+    {
+        // The (0,2) entry is already negligible: M is tridiagonal and Q = I.
+        diag[1] = m11;
+        diag[2] = m22;
+        subd[0] = m01;
+        subd[1] = m12;
+
+        for (int r = 0; r < 3; r++)
+        {
+            for (int c = 0; c < 3; c++)
+            {
+                mat[r][c] = (r == c) ? 1 : 0;
+            }
+        }
+        return;
+    }
+
+    // Normalize the (m01, m02) pair and build the reflection from it.
+    const float len = sqrtf(m01*m01+m02*m02);
+    m01 /= len;
+    m02 /= len;
+    const float q = 2*m01*m12+m02*(m22-m11);
+
+    diag[1] = m11+m02*q;
+    diag[2] = m22-m02*q;
+    subd[0] = len;
+    subd[1] = m12-m01*q;
+
+    mat[0][0] = 1;   mat[0][1] = 0;   mat[0][2] = 0;
+    mat[1][0] = 0;   mat[1][1] = m01; mat[1][2] = m02;
+    mat[2][0] = 0;   mat[2][1] = m02; mat[2][2] = -m01;
+}
+
+// Diagonalizes a symmetric tridiagonal 3x3 matrix in place.
+//   mat  - in/out: columns i and i+1 are rotated together with each step
+//          applied to diag/subd.
+//   diag - in: diagonal entries; out: updated diagonal.
+//   subd - in: subdiagonal entries in subd[0..1]; modified in place.
+// Returns false if any of the three passes runs for `maxiter` iterations.
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 3; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            // Find the first m >= ell whose subdiagonal entry is negligible
+            // relative to its neighbouring diagonal entries (m stops at 2).
+            int m;
+            for (m = ell; m <= 1; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            // Nothing left to eliminate for this ell.
+            if ( m == ell )
+                break;
+
+            // Derive the starting value g from diag[ell], diag[ell+1], subd[ell].
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            // Sweep from m-1 down to ell, applying one rotation (c, s) per step.
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                // Divide by the larger of |f| and |g| when forming the rotation.
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                // Apply the same rotation to columns i and i+1 of mat.
+                for (int k = 0; k < 3; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+// Tridiagonal solver for 4x4 symmetric matrices.
+
+// Forward declarations of the two solver stages used by eigenSolveSymmetric4:
+// reduction to tridiagonal form, then QL iteration (returns false if it does
+// not converge within its iteration limit).
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd);
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd);
+
+// Computes the eigenvalues and eigenvectors of a symmetric 4x4 matrix.
+//   matrix       - packed upper triangle, row by row:
+//                  [00, 01, 02, 03, 11, 12, 13, 22, 23, 33].
+//   eigenValues  - output, sorted in descending order.
+//   eigenVectors - output, eigenVectors[i] belongs to eigenValues[i].
+// Returns false (with all outputs zeroed) when the QL iteration fails.
+bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[4];
+    float diag[4];
+    float work[4][4];
+
+    // Expand the packed upper triangle into a full symmetric matrix.
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[0][3] = work[3][0] = matrix[3];
+    work[1][1] = matrix[4];
+    work[1][2] = work[2][1] = matrix[5];
+    work[1][3] = work[3][1] = matrix[6];
+    work[2][2] = matrix[7];
+    work[2][3] = work[3][2] = matrix[8];
+    work[3][3] = matrix[9];
+
+    EigenSolver4_Tridiagonal(work, diag, subd);
+    if (!EigenSolver4_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 4; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector4(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows
+
+    for (int i = 0; i < 4; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // Sort by eigenvalue, largest first, keeping each vector with its value.
+
+	for (int i = 0; i < 3; ++i)
+	{
+		for (int j = i+1; j < 4; ++j)
+		{
+			if (eigenValues[j] > eigenValues[i])
+			{
+				swap(eigenValues[i], eigenValues[j]);
+				swap(eigenVectors[i], eigenVectors[j]);
+			}
+		}
+	}
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]);
+    // The last pair to verify is (2, 3); comparing an entry with itself checks nothing.
+    nvDebugCheck(eigenValues[2] >= eigenValues[3]);
+
+    return true;
+}
+
+#include "nvmath/Matrix.inl"
+
+// Sign of `x` that never evaluates to zero: 1 for x >= 0 (including zero),
+// -1 otherwise.
+inline float signNonzero(float x)
+{
+	if (x >= 0.0f)
+	{
+		return 1.0f;
+	}
+	return -1.0f;
+}
+
+// Reduces a symmetric 4x4 matrix to tridiagonal form with Householder
+// reflections, one column at a time.
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 4x4 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric); subd[3] is set to 0
+
+	static const int n = 4;
+
+	// Set epsilon relative to size of elements in matrix.
+	// The running maximum must start at 0: starting at FLT_MAX would pin it
+	// there, make epsilon astronomically large and cause every column below to
+	// be skipped as "already tridiagonal".
+	static const float relEpsilon = 1e-6f;
+	float maxElement = 0.0f;
+	for (int i = 0; i < n; ++i)
+		for (int j = 0; j < n; ++j)
+			maxElement = max(maxElement, fabsf(mat[i][j]));
+	float epsilon = relEpsilon * maxElement;
+
+	// Iterative algorithm, works for any size of matrix but might be slower than
+	// a closed-form solution for symmetric 4x4 matrices.  Based on this article:
+	// http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization
+
+	// NOTE(review): the memcpy calls assume Matrix stores exactly n*n contiguous
+	// floats in a layout compatible with float[4][4] -- confirm against Matrix.h.
+	Matrix A, Q(identity);
+	memcpy(&A, mat, sizeof(float)*n*n);
+
+	// We proceed from left to right, making the off-tridiagonal entries zero in
+	// one column of the matrix at a time.
+	for (int k = 0; k < n - 2; ++k)
+	{
+		// Squared length of the part of column k below the diagonal.
+		float sum = 0.0f;
+		for (int j = k+1; j < n; ++j)
+			sum += A(j,k)*A(j,k);
+		float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum);
+		float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha));
+
+		// If r is zero, skip this column - already in tridiagonal form
+		if (fabsf(r) < epsilon)
+			continue;
+
+		// Householder vector; entries 0..k stay zero.
+		float v[n] = {};
+		v[k+1] = 0.5f * (A(k+1,k) - alpha) / r;
+		for (int j = k+2; j < n; ++j)
+			v[j] = 0.5f * A(j,k) / r;
+
+		// P = I - 2 v v^t
+		Matrix P(identity);
+		for (int i = 0; i < n; ++i)
+			for (int j = 0; j < n; ++j)
+				P(i,j) -= 2.0f * v[i] * v[j];
+
+		A = mul(mul(P, A), P);
+		Q = mul(Q, P);
+	}
+
+	// Entries outside the three central diagonals must now be negligible.
+	// `<=` keeps the checks valid for an all-zero input, where epsilon is 0.
+	nvDebugCheck(fabsf(A(2,0)) <= epsilon);
+	nvDebugCheck(fabsf(A(0,2)) <= epsilon);
+	nvDebugCheck(fabsf(A(3,0)) <= epsilon);
+	nvDebugCheck(fabsf(A(0,3)) <= epsilon);
+	nvDebugCheck(fabsf(A(3,1)) <= epsilon);
+	nvDebugCheck(fabsf(A(1,3)) <= epsilon);
+
+	for (int i = 0; i < n; ++i)
+		diag[i] = A(i,i);
+	for (int i = 0; i < n - 1; ++i)
+		subd[i] = A(i+1,i);
+	subd[n-1] = 0.0f;
+
+	memcpy(mat, &Q, sizeof(float)*n*n);
+}
+
+// Diagonalizes a symmetric tridiagonal 4x4 matrix in place.
+//   mat  - in/out: columns i and i+1 are rotated together with each step
+//          applied to diag/subd.
+//   diag - in: diagonal entries; out: updated diagonal.
+//   subd - in: subdiagonal entries in subd[0..2]; modified in place.
+// Returns false if any of the four passes runs for `maxiter` iterations.
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 4; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            // Find the first m >= ell whose subdiagonal entry is negligible
+            // relative to its neighbouring diagonal entries (m stops at 3).
+            int m;
+            for (m = ell; m < 3; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            // Nothing left to eliminate for this ell.
+            if ( m == ell )
+                break;
+
+            // Derive the starting value g from diag[ell], diag[ell+1], subd[ell].
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            // Sweep from m-1 down to ell, applying one rotation (c, s) per step.
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                // Divide by the larger of |f| and |g| when forming the rotation.
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                // Apply the same rotation to columns i and i+1 of mat.
+                for (int k = 0; k < 4; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+// Groups `n` weighted points into at most four clusters by iterative refinement.
+//   points  - the n points (at least one is read unconditionally).
+//   weights - weights[i] is the weight of points[i].
+//   metric  - per-component scale applied when computing the covariance and
+//             when measuring point-to-cluster distances.
+//   cluster - output, four cluster centers.
+// The initial centers are spread along the principal axis between the smallest
+// and largest projection of the points. Returns the number of clusters that
+// ended up with a non-zero total weight.
+int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
+{
+    // Compute principal component.
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);
+
+    // Pick initial solution: find the extreme projections onto the principal axis.
+    float mindps, maxdps;
+    mindps = maxdps = dot(points[0] - centroid, principal);
+
+    for (int i = 1; i < n; ++i)
+    {
+        float dps = dot(points[i] - centroid, principal);
+
+        if (dps < mindps) {
+            mindps = dps;
+        }
+        else if (dps > maxdps) {
+            // Only a strictly larger projection may replace the maximum; an
+            // unconditional else would overwrite it with any value that merely
+            // is not a new minimum.
+            maxdps = dps;
+        }
+    }
+
+    cluster[0] = centroid + mindps * principal;
+    cluster[1] = centroid + maxdps * principal;
+    cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f;
+    cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f;
+
+    // Now we have to iteratively refine the clusters.
+    while (true)
+    {
+        Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) };
+        float total[4] = {0, 0, 0, 0};
+
+        for (int i = 0; i < n; ++i)
+        {
+            // Find nearest cluster.
+            int nearest = 0;
+            float mindist = FLT_MAX;
+            for (int j = 0; j < 4; j++)
+            {
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
+                if (dist < mindist)
+                {
+                    mindist = dist;
+                    nearest = j;
+                }
+            }
+
+            // Accumulate the weighted point into its nearest cluster.
+            newCluster[nearest] += weights[i] * points[i];
+            total[nearest] += weights[i];
+        }
+
+        // Turn the weighted sums into weighted means; empty clusters stay at zero.
+        for (int j = 0; j < 4; j++)
+        {
+            if (total[j] != 0)
+                newCluster[j] /= total[j];
+        }
+
+        // Converged once no center moved.
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+        {
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+        }
+
+        cluster[0] = newCluster[0];
+        cluster[1] = newCluster[1];
+        cluster[2] = newCluster[2];
+        cluster[3] = newCluster[3];
+
+        // Sort clusters by weight (insertion sort, heaviest first).
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            {
+                swap( total[j], total[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
+            }
+        }
+    }
+}
+
+
+
+// Adaptation of James Arvo's SVD code, as found in ZOH.
+
+// Returns x squared.
+inline float Sqr(float x)
+{
+	const float squared = x * x;
+	return squared;
+}
+
+// Computes sqrt(a*a + b*b) by factoring out the larger magnitude, so the
+// squared term is a ratio no greater than 1. Returns 0 when both are zero.
+inline float svd_pythag( float a, float b )
+{
+	const float absA = fabsf(a);
+	const float absB = fabsf(b);
+
+	if( absA > absB )
+	{
+		return absA * sqrtf( 1.0f + Sqr( absB / absA ) );
+	}
+
+	if( absB > 0.0f )
+	{
+		return absB * sqrtf( 1.0f + Sqr( absA / absB ) );
+	}
+
+	return 0.0f;
+}
+
+// Returns the magnitude of `a` with the sign chosen by `b`: non-negative when
+// b >= 0, negative otherwise.
+inline float SameSign( float a, float b ) 
+{
+	const float magnitude = fabsf( a );
+	return ( b >= 0.0f ) ? magnitude : -magnitude;
+}
+
+// Singular value decomposition, adapted from James Arvo's SVD code.
+//   rows, cols - dimensions of the input matrix.
+//   Q    - in: rows x cols matrix, row-major (element (r, c) at Q[r*cols+c]);
+//          overwritten in place.
+//   diag - out: cols singular values, sorted in descending order on normal exit.
+//   R    - out: cols x cols matrix, row-major; its rows are reordered together
+//          with the singular values.
+// The routine returns early, without sorting, if a singular value has not
+// converged after MaxIterations sweeps.
+// NOTE(review): the phases below appear to follow the classic structure of
+// Householder bidiagonalization, accumulation of the transformations and
+// implicit-shift QR sweeps -- confirm against the original reference.
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
+{
+	static const int MaxIterations = 30;
+
+	int    i, j, k, l, p, q, iter;
+	float  c, f, h, s, x, y, z;
+	float  norm  = 0.0f;
+	float  g     = 0.0f;
+	float  scale = 0.0f;
+
+	// Scratch vector with one entry per column.
+    Array<float> temp; temp.resize(cols, 0.0f);
+
+	// Phase 1: process one column and one row of Q per iteration, filling
+	// diag[] and temp[] and tracking the largest |diag[i]| + |temp[i]| in norm.
+	for( i = 0; i < cols; i++ ) 
+	{
+		temp[i] = scale * g;
+		scale   = 0.0f;
+		g       = 0.0f;
+		s       = 0.0f;
+		l       = i + 1;
+
+		if( i < rows )
+		{
+			for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] );
+			if( scale != 0.0f ) 
+			{
+				for( k = i; k < rows; k++ ) 
+				{
+					Q[k*cols+i] /= scale;
+					s += Sqr( Q[k*cols+i] );
+				}
+				f = Q[i*cols+i];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+i] = f - g;
+				if( i != cols - 1 )
+				{
+					for( j = l; j < cols; j++ ) 
+					{
+						s = 0.0f;
+						for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+						f = s / h;
+						for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+					}
+				}
+				for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale;
+			}
+		}
+
+		diag[i] = scale * g;
+		g       = 0.0f;
+		s       = 0.0f;
+		scale   = 0.0f;
+
+		if( i < rows && i != cols - 1 ) 
+		{
+			for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] );
+			if( scale != 0.0f ) 
+			{
+				for( k = l; k < cols; k++ ) 
+				{
+					Q[i*cols+k] /= scale;
+					s += Sqr( Q[i*cols+k] );
+				}
+				f = Q[i*cols+l];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+l] = f - g;
+				for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h;
+				if( i != rows - 1 ) 
+				{
+					for( j = l; j < rows; j++ ) 
+					{
+						s = 0.0f;
+						for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k];
+						for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k];
+					}
+				}
+				for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale;
+			}
+		}
+		norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) );
+	}
+
+
+	// Phase 2: build R from the last row/column backwards, using the row data
+	// left in Q by phase 1.
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		if( i < cols - 1 ) 
+		{
+			if( g != 0.0f ) 
+			{
+				for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g;
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k];
+					for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k];
+				}
+			}
+			for( j = l; j < cols; j++ ) 
+			{
+				R[i*cols+j] = 0.0f;
+				R[j*cols+i] = 0.0f;
+			}
+		}
+		R[i*cols+i] = 1.0f;
+		g = temp[i];
+		l = i;
+	}
+
+
+	// Phase 3: rework Q in place from the last column backwards, using the
+	// column data left in Q by phase 1.
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		l = i + 1;
+		g = diag[i];
+		if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f;
+		if( g != 0.0f ) 
+		{
+			g = 1.0f / g;
+			if( i != cols - 1 ) 
+			{
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+					f = ( s / Q[i*cols+i] ) * g;
+					for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+				}
+			}
+			for( j = i; j < rows; j++ ) Q[j*cols+i] *= g;
+		} 
+		else 
+		{
+			for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f;
+		}
+		Q[i*cols+i] += 1.0f;
+	}
+
+
+	// Phase 4: iterate on each singular value, from the last to the first,
+	// until its off-diagonal companion in temp[] is negligible relative to norm.
+	for( k = cols - 1; k >= 0; k-- ) 
+	{
+		for( iter = 1; iter <= MaxIterations; iter++ ) 
+		{
+			int jump = 0;
+
+			// Look for a negligible temp[l] (jump = 1) or diag[l-1] (jump = 0).
+			// temp[0] is 0 after phase 1, so the first test always fires at l == 0
+			// before diag[q] would be read with q == -1.
+			for( l = k; l >= 0; l-- )
+			{
+				q = l - 1;
+				if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; }
+				if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; }
+			}
+
+			if( !jump )
+			{
+				// Cancel temp[l] by rotating columns q and i of Q.
+				c = 0.0f;
+				s = 1.0f;
+				for( i = l; i <= k; i++ )
+				{
+					f = s * temp[i];
+					temp[i] *= c;
+					if( fabsf( f ) + norm == norm ) break;
+					g = diag[i];
+					h = svd_pythag( f, g );
+					diag[i] = h;
+					h = 1.0f / h;
+					c = g * h;
+					s = -f * h;
+					for( j = 0; j < rows; j++ ) 
+					{
+						y = Q[j*cols+q];
+						z = Q[j*cols+i];
+						Q[j*cols+q] = y * c + z * s;
+						Q[j*cols+i] = z * c - y * s;
+					}
+				}
+			}
+
+			z = diag[k];
+			if( l == k ) 
+			{
+				// Converged: make the singular value non-negative, flipping the
+				// matching row of R when the sign changes.
+				if( z < 0.0f ) 
+				{
+					diag[k] = -z;
+					for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f; 
+				}
+				break;
+			}
+			// Give up (leaving the outputs unsorted) when out of iterations.
+			if( iter >= MaxIterations ) return;
+
+			// Compute the starting value f from the trailing entries.
+			x = diag[l];
+			q = k - 1;
+			y = diag[q];
+			g = temp[q];
+			h = temp[k];
+			f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y );
+			g = svd_pythag( f, 1.0f );
+			f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+
+			// One sweep: each step rotates a pair of rows of R, then a pair of
+			// columns of Q.
+			c = 1.0f;
+			s = 1.0f;
+			for( j = l; j <= q; j++ ) 
+			{
+				i = j + 1;
+				g = temp[i];
+				y = diag[i];
+				h = s * g;
+				g = c * g;
+				z = svd_pythag( f, h );
+				temp[j] = z;
+				c = f / z;
+				s = h / z;
+				f = x * c + g * s;
+				g = g * c - x * s;
+				h = y * s;
+				y = y * c;
+				for( p = 0; p < cols; p++ ) 
+				{
+					x = R[j*cols+p];
+					z = R[i*cols+p];
+					R[j*cols+p] = x * c + z * s;
+					R[i*cols+p] = z * c - x * s;
+				}
+				z = svd_pythag( f, h );
+				diag[j] = z;
+				if( z != 0.0f ) 
+				{
+					z = 1.0f / z;
+					c = f * z;
+					s = h * z;
+				}
+				f = c * g + s * y;
+				x = c * y - s * g;
+				for( p = 0; p < rows; p++ ) 
+				{
+					y = Q[p*cols+j];
+					z = Q[p*cols+i];
+					Q[p*cols+j] = y * c + z * s;
+					Q[p*cols+i] = z * c - y * s;
+				}
+			}
+			temp[l] = 0.0f;
+			temp[k] = f;
+			diag[k] = x;
+		}
+	}
+
+	// Sort the singular values into descending order (selection sort).
+
+	for( i = 0; i < cols - 1; i++ )
+	{
+		float biggest = diag[i];  // Biggest singular value so far.
+		int   bindex  = i;        // The row/col it occurred in.
+		for( j = i + 1; j < cols; j++ )
+		{
+			if( diag[j] > biggest ) 
+			{
+				biggest = diag[j];
+				bindex  = j;
+			}            
+		}
+		if( bindex != i )  // Need to swap rows and columns.
+		{
+			// Swap columns in Q (Q has `rows` rows).
+			for (int r = 0; r < rows; ++r)
+				swap(Q[r*cols+i], Q[r*cols+bindex]);
+
+			// Swap rows in R. R is cols x cols, so a full row has `cols` entries;
+			// iterating to `rows` would swap too few or run past the row end
+			// whenever rows != cols.
+			for (int col = 0; col < cols; ++col)
+				swap(R[i*cols+col], R[bindex*cols+col]);
+
+			// Swap elements in diag.
+			swap(diag[i], diag[bindex]);
+		}
+	}
+}
diff --git a/thirdparty/thekla_atlas/nvmath/Fitting.h b/thirdparty/thekla_atlas/nvmath/Fitting.h
new file mode 100644
index 0000000000..7a88cd28fd
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Fitting.h
@@ -0,0 +1,50 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_FITTING_H
+#define NV_MATH_FITTING_H
+
+#include "Vector.h"
+#include "Plane.h"
+// Fitting routines over arrays of `n` points. Only declarations appear in this header; the notes below are inferred from the signatures.
+namespace nv
+{
+    namespace Fit
+    {
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computeCentroid(int n, const Vector4 * points);
+        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+        // computeCovariance writes through `covariance`; the required length is not stated here (presumably 6 floats for Vector3, 10 for Vector4, as in eigenSolveSymmetric3/4 -- TODO confirm).
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+
+        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+        // Principal-component variants; the suffix presumably names the numerical method used (power method, eigen solver, SVD).
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+		Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+
+        Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+        // `matrix` has 6 (resp. 10) entries, presumably the packed triangle of a symmetric 3x3 (resp. 4x4) matrix -- TODO confirm the layout.
+        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+
+        // Returns number of clusters [1-4]. NOTE(review): `cluster` presumably needs room for 4 entries -- confirm.
+        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_FITTING_H
diff --git a/thirdparty/thekla_atlas/nvmath/KahanSum.h b/thirdparty/thekla_atlas/nvmath/KahanSum.h
new file mode 100644
index 0000000000..18d475e7cb
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/KahanSum.h
@@ -0,0 +1,39 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_KAHANSUM_H
+#define NV_MATH_KAHANSUM_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    // Running float sum with Kahan-style compensation: `err` carries the rounding error left over from the previous add().
+    class KahanSum
+    {
+    public:
+        KahanSum() : accum(0.0f), err(0) {};
+        // Adds f to the running sum, folding in the error recorded by the previous call.
+        void add(float f)
+        {
+            float compensated = f + err;        // input corrected by the previously lost low-order part
+            float tmp = accum + compensated;    // new running total; rounding may drop low-order bits here
+            err = accum - tmp;                  // minus what was actually added ...
+            err += compensated;                 // ... plus what should have been added = the part lost to rounding
+            accum = tmp;
+        }
+        // Returns the running total; the pending correction held in `err` is not included.
+        float sum() const
+        {
+            return accum;
+        }
+
+    private:
+        float accum;    // running total
+        float err;      // compensation term carried into the next add()
+    };
+
+} // nv namespace
+
+
+#endif // NV_MATH_KAHANSUM_H
diff --git a/thirdparty/thekla_atlas/nvmath/Matrix.cpp b/thirdparty/thekla_atlas/nvmath/Matrix.cpp
new file mode 100644
index 0000000000..29bd19f5f8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Matrix.cpp
@@ -0,0 +1,441 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Matrix.inl"
+#include "Vector.inl"
+
+#include "nvcore/Array.inl"
+
+#include <float.h>
+
+#if !NV_CC_MSVC && !NV_OS_ORBIS
+#include <alloca.h>
+#endif
+
+using namespace nv;
+
+
+// Replaces the n x n matrix a (indexed a[0..n-1][0..n-1]) by the LU decomposition of a rowwise
+// permutation of itself, in place (arranged as in equation (2.3.14) of the text this was taken from).
+// indx[0..n-1] is an output vector recording, for each column step j, the row that partial pivoting
+// swapped into position j; *d is set to +1 or -1 depending on whether the number of row interchanges
+// was even or odd, respectively. Returns false, before modifying a, when some row of a is entirely
+// zero; otherwise returns true. Used in combination with lubksb to solve linear equations or invert a matrix.
+static bool ludcmp(float **a, int n, int *indx, float *d)
+{
+    const float TINY = 1.0e-20f;    // Substituted for a pivot that is exactly zero (see below).
+
+    float * vv = (float*)alloca(sizeof(float) * n);    // vv stores the implicit scaling of each row.
+
+    *d = 1.0; // No row interchanges yet.
+    for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information.
+    
+        float big = 0.0;
+        for (int j = 0; j < n; j++) {
+            big = max(big, fabsf(a[i][j]));
+        }
+        if (big == 0) {
+            return false;   // Singular matrix: row i has no nonzero element.
+        }
+        
+        // big is the largest magnitude found in row i.
+        vv[i] = 1.0f / big; // Save the scaling.
+    }
+
+    for (int j = 0; j < n; j++) {       // This is the loop over columns of Crout's method.
+        for (int i = 0; i < j; i++) {   // This is equation (2.3.12) except for i = j.
+            float sum = a[i][j];
+            for (int k = 0; k < i; k++) sum -= a[i][k]*a[k][j];
+            a[i][j] = sum;
+        }
+
+        int imax = -1;
+        float big = 0.0;                // Initialize for the search for largest pivot element.
+        for (int i = j; i < n; i++) {   // This is i = j of equation (2.3.12) and i = j+1 ... n-1
+            float sum = a[i][j];              // of equation (2.3.13).
+            for (int k = 0; k < j; k++) {
+                sum -= a[i][k]*a[k][j];
+            }
+            a[i][j]=sum;
+
+            float dum = vv[i]*fabs(sum);
+            if (dum >= big) {
+                // Is the figure of merit for the pivot better than the best so far?
+                big = dum;
+                imax = i;
+            }
+        }
+        nvDebugCheck(imax != -1);       // big starts at 0 and the test is >=, so a row is always chosen unless dum is NaN.
+
+        if (j != imax) {                // Do we need to interchange rows?
+            for (int k = 0; k < n; k++) {   // Yes, do so...
+                swap(a[imax][k], a[j][k]);
+            }
+            *d = -(*d); // ...and change the parity of d.
+            vv[imax]=vv[j]; // Carry the scale factor along; vv[j] itself is not read again after this column.
+        }
+
+        indx[j]=imax;
+        if (a[j][j] == 0.0) a[j][j] = TINY;
+        
+        // If the pivot element is zero the matrix is singular (at least to the precision of the
+        // algorithm). For some applications on singular matrices, it is desirable to substitute
+        // TINY for zero.
+        if (j != n-1) { // Now, finally, divide by the pivot element.
+            float dum = 1.0f / a[j][j];
+            for (int i = j+1; i < n; i++) a[i][j] *= dum;
+        }
+    } // Go back for the next column in the reduction.
+
+    return true;
+}
+
+
+// Solves the set of n linear equations A x = b. a[0..n-1][0..n-1] is input, not as the matrix
+// A but as its LU decomposition produced by ludcmp, and indx[0..n-1] is the row permutation
+// that ludcmp recorded. b[0..n-1] holds the right-hand side on entry and is overwritten with
+// the solution x. a, n and indx are not modified, so they can be left in place for successive
+// calls with different right-hand sides. Leading zero elements of b are skipped during the
+// forward substitution (tracked by ii below), which keeps the routine efficient for matrix
+// inversion, where the right-hand sides are unit vectors.
+static void lubksb(float **a, int n, int *indx, float b[])
+{
+    int ii = 0;                 // 0 until a nonzero element of b is seen; afterwards 1 + the index of that element.
+    for (int i=0; i<n; i++) {   // Forward substitution, equation (2.3.6). The only
+        int ip = indx[i];       // wrinkle is to unscramble the row permutation
+        float sum = b[ip];      // recorded in indx as we go.
+        b[ip] = b[i];
+        if (ii != 0) {
+            for (int j = ii-1; j < i; j++) sum -= a[i][j]*b[j];
+        }
+        else if (sum != 0.0f) {
+            ii = i+1;             // A nonzero element was encountered, so from now on
+        }
+        b[i] = sum;             // the sums in the loop above have to be done.
+    }
+    for (int i=n-1; i>=0; i--) {  // Now we do the backsubstitution, equation (2.3.7).
+        float sum = b[i];
+        for (int j = i+1; j < n; j++) {
+            sum -= a[i][j]*b[j];
+        }
+        b[i] = sum/a[i][i];     // Store a component of the solution vector X.
+    } // All done!
+}
+
+
+bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    // Row-pointer view over a scratch copy of A, the layout ludcmp/lubksb work on.
+    float storage[4][4];
+    float *rows[4] = {storage[0], storage[1], storage[2], storage[3]};
+    int permutation[4];
+    float parity;   // Sign of the row permutation; written by ludcmp, not needed here.
+
+    for (int c = 0; c < 4; c++) {
+        for (int r = 0; r < 4; r++) {
+            rows[r][c] = A(r, c);
+        }
+    }
+
+    // Factor the copy in place; give up when ludcmp reports a singular matrix.
+    const bool factored = ludcmp(rows, 4, permutation, &parity);
+    if (!factored) {
+        return false;
+    }
+
+    // lubksb overwrites its right-hand side with the solution, so seed *x with b
+    // and solve directly into the caller's vector.
+    *x = b;
+    lubksb(rows, 4, permutation, x->component);
+
+    return true;
+}
+
+// Inverts A by solving A * x = e_k for each unit vector e_k; a singular A yields the zero matrix. @@ Not tested.
+Matrix nv::inverseLU(const Matrix & A)
+{
+    Vector4 Ai[4];  // Ai[k] solves A * x = e_k, i.e. it is column k of the inverse.
+    const bool solved =
+        solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]) &&
+        solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]) &&
+        solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]) &&
+        solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]);
+    if (!solved) return Matrix(0.0f);   // solveLU does not write its output on failure, so Ai must not be used.
+    return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]);
+}
+
+
+
+bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    // Row-pointer view over a scratch copy of A, the layout ludcmp/lubksb work on.
+    float storage[3][3];
+    float *rows[3] = {storage[0], storage[1], storage[2]};
+    int permutation[3];
+    float parity;   // Sign of the row permutation; written by ludcmp, not needed here.
+
+    for (int c = 0; c < 3; c++) {
+        for (int r = 0; r < 3; r++) {
+            rows[r][c] = A(r, c);
+        }
+    }
+
+    // Factor the copy in place; give up when ludcmp reports a singular matrix.
+    const bool factored = ludcmp(rows, 3, permutation, &parity);
+    if (!factored) {
+        return false;
+    }
+
+    // lubksb overwrites its right-hand side with the solution, so seed *x with b
+    // and solve directly into the caller's vector.
+    *x = b;
+    lubksb(rows, 3, permutation, x->component);
+
+    return true;
+}
+
+
+bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x)
+{
+    nvDebugCheck(x != NULL);
+    // Solves A * x = b through the inverse of A. A determinant of exactly zero means there is
+    // no inverse: report failure and leave *x untouched instead of always claiming success.
+    if (A.determinant() == 0.0f) return false;
+    *x = transform(inverseCramer(A), b);
+    return true;
+}
+
+bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
+{
+    nvDebugCheck(x != NULL);
+
+    // Solve A * x = b through the inverse of A. A determinant that the two-argument
+    // equal() treats as zero is reported as failure, and *x is left alone.
+    const bool singular = equal(A.determinant(), 0.0f);   // @@ Use input epsilon.
+    if (singular) {
+        return false;
+    }
+
+    const Matrix3 inverseOfA = inverseCramer(A);
+    *x = transform(inverseOfA, b);
+    return true;
+}
+
+
+
+// Returns the inverse of m, using Gaussian elimination with partial pivoting (forward pass, then backward pass). From Jon's code.
+Matrix nv::inverse(const Matrix & m) {
+    // If some column has no nonzero pivot (singular m), the partially reduced B is returned as is; no error is signalled.
+    Matrix A = m;           // Working copy that the row operations reduce.
+    Matrix B(identity);     // Receives the same row operations; this is the value returned.
+
+    int i, j, k;
+    float max, t, det, pivot;
+
+    det = 1.0;                          /* det is accumulated below but never read or returned */
+    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<4; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return B;         /* if no nonzero pivot, PUNT (also taken when j was never set) */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<4; k++)
+                swap(A(i, k), A(j, k));
+            for (k=0; k<4; k++)
+                swap(B(i, k), B(j, k));
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<4; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<4; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<4; k++)         /* subtract scaled row i from row j (only B is updated; A is not needed any more) */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return B;
+}
+
+// Returns the inverse of the 3x3 matrix m, using Gaussian elimination with partial pivoting (forward pass, then backward pass).
+Matrix3 nv::inverse(const Matrix3 & m) {
+    // If some column has no nonzero pivot (singular m), the partially reduced B is returned as is; no error is signalled.
+    Matrix3 A = m;          // Working copy that the row operations reduce.
+    Matrix3 B(identity);    // Receives the same row operations; this is the value returned.
+
+    int i, j, k;
+    float max, t, det, pivot;
+
+    det = 1.0;                          /* det is accumulated below but never read or returned */
+    for (i=0; i<3; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<3; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return B;         /* if no nonzero pivot, PUNT (also taken when j was never set) */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<3; k++)
+                swap(A(i, k), A(j, k));
+            for (k=0; k<3; k++)
+                swap(B(i, k), B(j, k));
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<3; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<3; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<3; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<3; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<3; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=3-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<3; k++)         /* subtract scaled row i from row j (only B is updated; A is not needed any more) */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return B;
+}
+
+
+
+
+
+#if 0 
+
+// Copyright (C) 1999-2004 Michael Garland.
+// 
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, and/or sell copies of the Software, and to permit persons
+// to whom the Software is furnished to do so, provided that the above
+// copyright notice(s) and this permission notice appear in all copies of
+// the Software and that both the above copyright notice(s) and this
+// permission notice appear in supporting documentation.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+// OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+// HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
+// INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
+// FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+// NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+// WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+// 
+// Except as contained in this notice, the name of a copyright holder
+// shall not be used in advertising or otherwise to promote the sale, use
+// or other dealings in this Software without prior written authorization
+// of the copyright holder.
+
+
+// Matrix inversion code for 4x4 matrices using Gaussian elimination
+// with partial pivoting.  This is a specialized version of a
+// procedure originally due to Paul Heckbert <ph@cs.cmu.edu>.
+//
+// Returns determinant of A, and B=inverse(A)
+// If matrix A is singular, returns 0 and leaves trash in B.
+//
+#define SWAP(a, b, t)   {t = a; a = b; b = t;}
+double invert(Mat4& B, const Mat4& m)
+{
+    Mat4 A = m;
+    int i, j, k;
+    double max, t, det, pivot;
+
+    /*---------- forward elimination ----------*/
+
+    for (i=0; i<4; i++)                 /* put identity matrix in B */
+        for (j=0; j<4; j++)
+            B(i, j) = (double)(i==j);
+
+    det = 1.0;
+    for (i=0; i<4; i++) {               /* eliminate in column i, below diag */
+        max = -1.;
+        for (k=i; k<4; k++)             /* find pivot for column i */
+            if (fabs(A(k, i)) > max) {
+                max = fabs(A(k, i));
+                j = k;
+            }
+        if (max<=0.) return 0.;         /* if no nonzero pivot, PUNT */
+        if (j!=i) {                     /* swap rows i and j */
+            for (k=i; k<4; k++)
+                SWAP(A(i, k), A(j, k), t);
+            for (k=0; k<4; k++)
+                SWAP(B(i, k), B(j, k), t);
+            det = -det;
+        }
+        pivot = A(i, i);
+        det *= pivot;
+        for (k=i+1; k<4; k++)           /* only do elems to right of pivot */
+            A(i, k) /= pivot;
+        for (k=0; k<4; k++)
+            B(i, k) /= pivot;
+        /* we know that A(i, i) will be set to 1, so don't bother to do it */
+
+        for (j=i+1; j<4; j++) {         /* eliminate in rows below i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=i+1; k<4; k++)       /* subtract scaled row i from row j */
+                A(j, k) -= A(i, k)*t;   /* (ignore k<=i, we know they're 0) */
+            for (k=0; k<4; k++)
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    /*---------- backward elimination ----------*/
+
+    for (i=4-1; i>0; i--) {             /* eliminate in column i, above diag */
+        for (j=0; j<i; j++) {           /* eliminate in rows above i */
+            t = A(j, i);                /* we're gonna zero this guy */
+            for (k=0; k<4; k++)         /* subtract scaled row i from row j */
+                B(j, k) -= B(i, k)*t;
+        }
+    }
+
+    return det;
+}
+
+#endif // 0
+
+
+
diff --git a/thirdparty/thekla_atlas/nvmath/Matrix.h b/thirdparty/thekla_atlas/nvmath/Matrix.h
new file mode 100644
index 0000000000..506bdad1ca
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Matrix.h
@@ -0,0 +1,113 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include "Vector.h"
+
+// - Matrices are stored in memory in *column major* order.
+// - Points are to be thought of as column vectors.
+// - Transformation of a point p by a matrix M is: p' = M * p
+
+namespace nv
+{
+    enum identity_t { identity };
+
+    // 3x3 matrix of floats, stored column-major: element (row, col) lives at m_data[col * 3 + row].
+    class NVMATH_CLASS Matrix3
+    {
+    public:
+        Matrix3();                      // Leaves the elements uninitialized.
+        explicit Matrix3(float f);      // Sets every element to f.
+        explicit Matrix3(identity_t);   // Identity matrix.
+        Matrix3(const Matrix3 & m);
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);     // v0, v1, v2 become columns 0, 1, 2.
+        // data() indexes the raw column-major array (idx < 9); get() and operator() take (row, col).
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+
+        Vector3 row(uint i) const;
+        Vector3 column(uint i) const;
+        // Entry-wise compound operators; operator/= multiplies by the reciprocal of s.
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator+=(const Matrix3 & m);
+        void operator-=(const Matrix3 & m);
+
+        void scale(float s);            // Multiplies every element by s.
+        void scale(Vector3::Arg s);     // Multiplies column 0 by s.x, column 1 by s.y, column 2 by s.z.
+        float determinant() const;
+
+    private:
+        float m_data[9];                // Column-major storage.
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+
+
+    // 4x4 matrix of floats, stored column-major: element (row, col) lives at m_data[col * 4 + row].
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        typedef Matrix const & Arg;
+
+        Matrix();                       // Leaves the elements uninitialized.
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);    // Identity matrix.
+        Matrix(const Matrix3 & m);      // Copies m into the upper-left 3x3 block; row 3 and column 3, including (3, 3), are zeroed.
+        Matrix(const Matrix & m);
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);     // v0..v3 become columns 0..3.
+        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
+        // data() indexes the raw column-major array (idx < 16); get() and operator() take (row, col).
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        const float * ptr() const;      // Pointer to the sixteen column-major floats.
+
+        Vector4 row(uint i) const;
+        Vector4 column(uint i) const;
+
+        void zero();                    // Sets every element to 0.
+        void identity();                // Resets this matrix to the identity.
+        // The operations below modify this matrix in place.
+        void scale(float s);            // Multiplies all sixteen elements by s.
+        void scale(Vector3::Arg s);     // Multiplies columns 0, 1, 2 by s.x, s.y, s.z; column 3 is unchanged.
+        void translate(Vector3::Arg t); // Only column 3 changes: it becomes M * (t.x, t.y, t.z, 1).
+        void rotate(float theta, float v0, float v1, float v2);     // Calls apply(rotation(theta, v0, v1, v2)).
+        float determinant() const;
+
+        void operator+=(const Matrix & m);
+        void operator-=(const Matrix & m);
+
+        void apply(Matrix::Arg m);      // Replaces this matrix by (*this) * m; m must be a different object.
+
+    private:
+        float m_data[16];               // Column-major storage.
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Compute inverse using LU decomposition.
+    extern Matrix inverseLU(const Matrix & m);
+
+    // Compute inverse using Gaussian elimination and partial pivoting.
+    extern Matrix inverse(const Matrix & m);
+    extern Matrix3 inverse(const Matrix3 & m);
+
+} // nv namespace
+
+#endif // NV_MATH_MATRIX_H
diff --git a/thirdparty/thekla_atlas/nvmath/Matrix.inl b/thirdparty/thekla_atlas/nvmath/Matrix.inl
new file mode 100644
index 0000000000..c0d99d9fe0
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Matrix.inl
@@ -0,0 +1,1274 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_MATRIX_INL
+#define NV_MATH_MATRIX_INL
+
+#include "Matrix.h"
+
+namespace nv
+{
+    inline Matrix3::Matrix3() {}   // Default constructor: m_data is left uninitialized.
+    
+    inline Matrix3::Matrix3(float f)
+    {
+        // Broadcast f into all nine entries.
+        float * const end = m_data + 9;
+        for (float * entry = m_data; entry != end; ++entry) *entry = f;
+    }
+
+    inline Matrix3::Matrix3(identity_t)
+    {
+        // Clear everything, then put ones on the diagonal (indices 0, 4 and 8).
+        for (int k = 0; k < 9; ++k) {
+            m_data[k] = 0.0f;
+        }
+        m_data[0] = m_data[4] = m_data[8] = 1.0f;
+    }
+
+    inline Matrix3::Matrix3(const Matrix3 & m)
+    {
+        // Element-wise copy of the nine entries.
+        const float * src = m.m_data;
+        for (float * dst = m_data; dst != m_data + 9; ++dst, ++src) *dst = *src;
+    }
+    
+    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+    {   // v0, v1 and v2 are stored as columns 0, 1 and 2 (element (row, col) is m_data[col * 3 + row]).
+        m_data[0 * 3 + 0] = v0.x; m_data[0 * 3 + 1] = v0.y; m_data[0 * 3 + 2] = v0.z;
+        m_data[1 * 3 + 0] = v1.x; m_data[1 * 3 + 1] = v1.y; m_data[1 * 3 + 2] = v1.z;
+        m_data[2 * 3 + 0] = v2.x; m_data[2 * 3 + 1] = v2.y; m_data[2 * 3 + 2] = v2.z;
+    }
+
+    inline float Matrix3::data(uint idx) const
+    {   // Raw read access: idx addresses the nine-entry column-major array directly.
+        nvDebugCheck(idx < 9);
+        return *(m_data + idx);
+    }
+    inline float & Matrix3::data(uint idx)
+    {   // Raw write access to the same storage.
+        nvDebugCheck(idx < 9);
+        return *(m_data + idx);
+    }
+    inline float Matrix3::get(uint row, uint col) const
+    {   // Element (row, col) by value; storage is column-major, so columns are 3 floats apart.
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[row + 3 * col];
+    }
+    inline float Matrix3::operator()(uint row, uint col) const
+    {   // Same lookup as get(), spelled as a call operator.
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[row + 3 * col];
+    }
+    inline float & Matrix3::operator()(uint row, uint col)
+    {   // Mutable reference to element (row, col).
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[row + 3 * col];
+    }
+
+    inline Vector3 Matrix3::row(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(operator()(i, 0), operator()(i, 1), operator()(i, 2));   // row i, left to right
+    }
+    inline Vector3 Matrix3::column(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(operator()(0, i), operator()(1, i), operator()(2, i));   // column i, top to bottom
+    }
+
+    inline void Matrix3::operator*=(float s)
+    {
+        // Multiply every entry by s in place.
+        float * const end = m_data + 9;
+        for (float * entry = m_data; entry != end; ++entry) *entry *= s;
+    }
+
+    inline void Matrix3::operator/=(float s)
+    {
+        // Divide via one reciprocal followed by nine multiplications.
+        const float reciprocal = 1.0f / s;
+        float * const end = m_data + 9;
+        for (float * entry = m_data; entry != end; ++entry) *entry *= reciprocal;
+    }
+
+    inline void Matrix3::operator+=(const Matrix3 & m)
+    {
+        // Entry-wise accumulation of m into this matrix.
+        const float * src = m.m_data;
+        for (float * dst = m_data; dst != m_data + 9; ++dst, ++src) *dst += *src;
+    }
+
+    inline void Matrix3::operator-=(const Matrix3 & m)
+    {
+        // Entry-wise subtraction of m from this matrix.
+        const float * src = m.m_data;
+        for (float * dst = m_data; dst != m_data + 9; ++dst, ++src) *dst -= *src;
+    }
+
+    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
+    {   // Entry-wise sum: copy a, then accumulate b into the copy.
+        Matrix3 sum(a);
+        sum += b;
+        return sum;
+    }
+
+    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
+    {   // Entry-wise difference: copy a, then subtract b from the copy.
+        Matrix3 difference(a);
+        difference -= b;
+        return difference;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, float s)
+    {   // Every entry of a multiplied by s.
+        Matrix3 scaled(a);
+        scaled *= s;
+        return scaled;
+    }
+
+    inline Matrix3 operator*(float s, const Matrix3 & a)
+    {   // Scalar-on-the-left spelling; same result as a * s.
+        Matrix3 scaled(a);
+        scaled *= s;
+        return scaled;
+    }
+
+    inline Matrix3 operator/(const Matrix3 & a, float s)
+    {   // Every entry of a divided by s (through Matrix3::operator/=).
+        Matrix3 quotient(a);
+        quotient /= s;
+        return quotient;
+    }
+
+    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
+    {
+        // Matrix product: result(r, c) = a(r,0) * b(0,c) + a(r,1) * b(1,c) + a(r,2) * b(2,c).
+        Matrix3 product;
+        for (int r = 0; r < 3; ++r) {
+            const float a0 = a(r,0), a1 = a(r,1), a2 = a(r,2);
+            for (int c = 0; c < 3; ++c) {
+                product(r, c) = a0 * b(0,c) + a1 * b(1,c) + a2 * b(2,c);
+            }
+        }
+
+        return product;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)
+    {   // Operator spelling of mul(a, b).
+        return mul(a, b);
+    }
+
+    // Applies m to the column vector p, i.e. returns m * p.
+    inline Vector3 transform(const Matrix3 & m, const Vector3 & p)
+    {
+        const float tx = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2);
+        const float ty = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2);
+        const float tz = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2);
+        return Vector3(tx, ty, tz);
+    }
+
+    inline void Matrix3::scale(float s)
+    {
+        // Uniform scale: every one of the nine entries is multiplied by s.
+        float * const end = m_data + 9;
+        for (float * entry = m_data; entry != end; ++entry) *entry *= s;
+    }
+
+    inline void Matrix3::scale(Vector3::Arg s)
+    {   // Column c is multiplied by component c of s: column 0 by s.x, column 1 by s.y, column 2 by s.z.
+        for (int r = 0; r < 3; ++r) m_data[0 + r] *= s.x;
+        for (int r = 0; r < 3; ++r) m_data[3 + r] *= s.y;
+        for (int r = 0; r < 3; ++r) m_data[6 + r] *= s.z;
+    }
+
+    inline float Matrix3::determinant() const
+    {   // Rule of Sarrus on the elements m(row, col): three diagonal products minus three anti-diagonal ones.
+        const Matrix3 & m = *this;
+        return m(0,0) * m(1,1) * m(2,2) +
+               m(0,1) * m(1,2) * m(2,0) +
+               m(0,2) * m(1,0) * m(2,1) -
+               m(0,2) * m(1,1) * m(2,0) -
+               m(0,1) * m(1,0) * m(2,2) -
+               m(0,0) * m(1,2) * m(2,1);
+    }
+
+    // Inverse using Cramer's rule: the adjugate of m divided by its determinant. Returns the zero matrix when the determinant is exactly 0.
+    inline Matrix3 inverseCramer(const Matrix3 & m)
+    {
+        const float det = m.determinant();
+        if (equal(det, 0.0f, 0.0f)) {
+            return Matrix3(0);
+        }
+
+        Matrix3 r;
+        // inverse(row, col) = cofactor(col, row) / det and element (row, col) is stored at col * 3 + row, so cofactor(i, j) goes to index i * 3 + j.
+        r.data(0) =  - m.data(5) * m.data(7) + m.data(4) * m.data(8);
+        r.data(3) =  + m.data(5) * m.data(6) - m.data(3) * m.data(8);
+        r.data(6) =  - m.data(4) * m.data(6) + m.data(3) * m.data(7);
+        // (The off-diagonal targets used to be written untransposed, which produced the transpose of the inverse.)
+        r.data(1) =  + m.data(2) * m.data(7) - m.data(1) * m.data(8);
+        r.data(4) =  - m.data(2) * m.data(6) + m.data(0) * m.data(8);
+        r.data(7) =  + m.data(1) * m.data(6) - m.data(0) * m.data(7);
+
+        r.data(2) =  - m.data(2) * m.data(4) + m.data(1) * m.data(5);
+        r.data(5) =  + m.data(2) * m.data(3) - m.data(0) * m.data(5);
+        r.data(8) =  - m.data(1) * m.data(3) + m.data(0) * m.data(4);
+
+        r.scale(1.0f / det);
+
+        return r;
+    }
+
+
+
+    inline Matrix::Matrix()
+    {   // Empty body: the sixteen entries of m_data are left uninitialized.
+    }
+
+    inline Matrix::Matrix(float f)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = f;  // Fill with f, as Matrix3(float) does; this used to store 0.0f and ignore the argument.
+        }
+    }
+
+    inline Matrix::Matrix(identity_t)
+    {
+        // Clear everything, then put ones on the diagonal (indices 0, 5, 10 and 15).
+        for (int k = 0; k < 16; ++k) {
+            m_data[k] = 0.0f;
+        }
+        m_data[0] = m_data[5] = m_data[10] = m_data[15] = 1.0f;
+    }
+
+    inline Matrix::Matrix(const Matrix & m)
+    {
+        // Element-wise copy of the sixteen entries.
+        const float * src = m.m_data;
+        for (float * dst = m_data; dst != m_data + 16; ++dst, ++src) *dst = *src;
+    }
+
+    inline Matrix::Matrix(const Matrix3 & m)
+    {
+        // The upper-left 3x3 block comes from m; the whole fourth row and fourth column,
+        // including element (3, 3), are set to zero.
+        for (int c = 0; c < 3; ++c) {
+            for (int r = 0; r < 3; ++r) {
+                m_data[c * 4 + r] = m.get(r, c);
+            }
+            m_data[c * 4 + 3] = 0;
+        }
+        m_data[12] = m_data[13] = m_data[14] = m_data[15] = 0;
+    }
+
+    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
+    {   // v0..v3 are stored as columns 0..3 (element (row, col) is m_data[col * 4 + row]).
+        m_data[0 * 4 + 0] = v0.x; m_data[0 * 4 + 1] = v0.y; m_data[0 * 4 + 2] = v0.z; m_data[0 * 4 + 3] = v0.w;
+        m_data[1 * 4 + 0] = v1.x; m_data[1 * 4 + 1] = v1.y; m_data[1 * 4 + 2] = v1.z; m_data[1 * 4 + 3] = v1.w;
+        m_data[2 * 4 + 0] = v2.x; m_data[2 * 4 + 1] = v2.y; m_data[2 * 4 + 2] = v2.z; m_data[2 * 4 + 3] = v2.w;
+        m_data[3 * 4 + 0] = v3.x; m_data[3 * 4 + 1] = v3.y; m_data[3 * 4 + 2] = v3.z; m_data[3 * 4 + 3] = v3.w;
+    }
+
+    /*inline Matrix::Matrix(const float m[])
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m[i];
+        }
+    }*/
+
+
+    // Accessors
+    inline float Matrix::data(uint idx) const
+    {   // Raw read access: idx addresses the sixteen-entry column-major array directly.
+        nvDebugCheck(idx < 16);
+        return *(m_data + idx);
+    }
+    inline float & Matrix::data(uint idx)
+    {   // Raw write access to the same storage.
+        nvDebugCheck(idx < 16);
+        return *(m_data + idx);
+    }
+    inline float Matrix::get(uint row, uint col) const
+    {   // Element (row, col) by value; storage is column-major, so columns are 4 floats apart.
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[row + 4 * col];
+    }
+    inline float Matrix::operator()(uint row, uint col) const
+    {   // Same lookup as get(), spelled as a call operator.
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[row + 4 * col];
+    }
+    inline float & Matrix::operator()(uint row, uint col)
+    {   // Mutable reference to element (row, col).
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[row + 4 * col];
+    }
+
+    inline const float * Matrix::ptr() const
+    {   // Read-only pointer to the first of the sixteen column-major entries.
+        return &m_data[0];
+    }
+
+    inline Vector4 Matrix::row(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(operator()(i, 0), operator()(i, 1), operator()(i, 2), operator()(i, 3));     // row i, left to right
+    }
+
+    inline Vector4 Matrix::column(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(operator()(0, i), operator()(1, i), operator()(2, i), operator()(3, i));     // column i, top to bottom
+    }
+
+    inline void Matrix::zero()
+    {
+        // Reset all sixteen entries to zero.
+        for (int k = 0; k < 16; ++k) {
+            m_data[k] = 0;
+        }
+    }
+
+    inline void Matrix::identity()
+    {
+        // Ones on the diagonal (every fifth entry of the 4x4 storage), zeros elsewhere.
+        for (int k = 0; k < 16; ++k) {
+            m_data[k] = (k % 5 == 0) ? 1.0f : 0.0f;
+        }
+    }
+
+    // Apply a uniform scale: all sixteen entries, the fourth row and
+    // column included, are multiplied by s.
+    inline void Matrix::scale(float s)
+    {
+        for (int k = 0; k < 16; ++k) {
+            m_data[k] *= s;
+        }
+    }
+
+    // Apply per-axis scale: column 0 is multiplied by s.x, column 1 by s.y
+    // and column 2 by s.z. Column 3 (m_data[12..15]) is left untouched.
+    inline void Matrix::scale(Vector3::Arg s)
+    {
+        for (int r = 0; r < 4; r++) {
+            m_data[r]     *= s.x;
+            m_data[4 + r] *= s.y;
+            m_data[8 + r] *= s.z;
+        }
+    }
+
+    // Apply translation: adds (column0 * t.x + column1 * t.y + column2 * t.z)
+    // to column 3, row by row. Columns 0..2 are not modified.
+    inline void Matrix::translate(Vector3::Arg t)
+    {
+        for (int r = 0; r < 4; r++) {
+            m_data[12 + r] = m_data[r] * t.x + m_data[4 + r] * t.y + m_data[8 + r] * t.z + m_data[12 + r];
+        }
+    }
+
+    // Declared here so Matrix::rotate can call it; the definition follows
+    // later in this file.
+    Matrix rotation(float theta, float v0, float v1, float v2);
+
+    // Apply rotation: builds the matrix returned by rotation(theta, v0, v1, v2)
+    // and combines it into this matrix through apply().
+    inline void Matrix::rotate(float theta, float v0, float v1, float v2)
+    {
+        apply(rotation(theta, v0, v1, v2));
+    }
+
+    // Apply transform: replaces this matrix with (this * m), using the
+    // (row, col) convention of operator(). m must not be this matrix
+    // itself (debug-checked).
+    inline void Matrix::apply(Matrix::Arg m)
+    {
+        nvDebugCheck(this != &m);
+
+        for (int row = 0; row < 4; row++) {
+            // Cache the current row first: the inner loop overwrites it.
+            const float a0 = get(row, 0);
+            const float a1 = get(row, 1);
+            const float a2 = get(row, 2);
+            const float a3 = get(row, 3);
+
+            for (int col = 0; col < 4; col++) {
+                m_data[4 * col + row] = a0 * m(0,col) + a1 * m(1,col) + a2 * m(2,col) + a3 * m(3,col);
+            }
+        }
+    }
+
+    // Get scale matrix: identity with s.x, s.y, s.z on the first three
+    // diagonal entries.
+    inline Matrix scale(Vector3::Arg s)
+    {
+        Matrix result(identity);
+        result(2,2) = s.z;
+        result(1,1) = s.y;
+        result(0,0) = s.x;
+        return result;
+    }
+
+    // Get uniform scale matrix: identity with s on the first three
+    // diagonal entries; element (3,3) keeps its identity value.
+    inline Matrix scale(float s)
+    {
+        Matrix result(identity);
+        result(2,2) = s;
+        result(1,1) = s;
+        result(0,0) = s;
+        return result;
+    }
+
+    // Get translation matrix: identity with t stored in rows 0..2 of the
+    // last column.
+    inline Matrix translation(Vector3::Arg t)
+    {
+        Matrix result(identity);
+        result(0,3) = t.x;
+        result(1,3) = t.y;
+        result(2,3) = t.z;
+        return result;
+    }
+
+    // Get rotation matrix.
+    //
+    // Builds a rotation of theta radians about the axis (v0, v1, v2).
+    // The three exact unit axes take a fast path; any other axis is
+    // normalized and expanded with the axis-angle (Rodrigues) formula,
+    // using the same sign convention as the fast paths (the -sin term sits
+    // at (2,1) for X, (0,2) for Y and (1,0) for Z).
+    //
+    // Only the upper-left 3x3 block is written; the remaining entries keep
+    // their identity values. The axis is not checked for zero length: a
+    // zero axis divides by zero during normalization.
+    inline Matrix rotation(float theta, float v0, float v1, float v2)
+    {
+        float cost = cosf(theta);
+        float sint = sinf(theta);
+
+        Matrix m(identity);
+
+        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+            m(1,1) = cost; m(2,1) = -sint;
+            m(1,2) = sint; m(2,2) = cost;
+        }
+        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+            // The -sin term belongs at (0,2): a Y rotation only mixes rows
+            // and columns 0 and 2.
+            m(0,0) = cost; m(2,0) = sint;
+            m(0,2) = -sint; m(2,2) = cost;
+        }
+        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+            m(0,0) = cost; m(1,0) = -sint;
+            m(0,1) = sint; m(1,1) = cost;
+        }
+        else {
+            // Normalize the axis first so that every term below, including
+            // the squared ones on the diagonal, uses unit-length components.
+            float iscale = 1.0f / sqrtf(v0 * v0 + v1 * v1 + v2 * v2);
+            v0 *= iscale;
+            v1 *= iscale;
+            v2 *= iscale;
+
+            const float mcos = 1.0f - cost;
+            const float abm = v0 * v1 * mcos;
+            const float acm = v0 * v2 * mcos;
+            const float bcm = v1 * v2 * mcos;
+            const float xs = v0 * sint;
+            const float ys = v1 * sint;
+            const float zs = v2 * sint;
+
+            // Column 0.
+            m(0,0) = v0 * v0 * mcos + cost;
+            m(1,0) = abm - zs;
+            m(2,0) = acm + ys;
+            // Column 1.
+            m(0,1) = abm + zs;
+            m(1,1) = v1 * v1 * mcos + cost;
+            m(2,1) = bcm - xs;
+            // Column 2.
+            m(0,2) = acm - ys;
+            m(1,2) = bcm + xs;
+            m(2,2) = v2 * v2 * mcos + cost;
+        }
+        return m;
+    }
+
+    //Matrix rotation(float yaw, float pitch, float roll);
+    //Matrix skew(float angle, Vector3::Arg v1, Vector3::Arg v2);
+
+    // Get frustum matrix.
+    // Perspective projection for the view volume bounded by
+    // [xmin, xmax] x [ymin, ymax] on the near plane and by zNear / zFar in
+    // depth. Entries not assigned below keep whatever Matrix(0.0f) gives
+    // them (presumably zero).
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        const float twoNear = 2.0f * zNear;
+        const float invWidth = 1.0f / (xmax - xmin);
+        const float invHeight = 1.0f / (ymax - ymin);
+        const float invDepth = 1.0f / (zFar - zNear);
+
+        Matrix proj(0.0f);
+
+        // X row.
+        proj(0,0) = twoNear * invWidth;
+        proj(0,2) = (xmax + xmin) * invWidth;
+
+        // Y row.
+        proj(1,1) = twoNear * invHeight;
+        proj(1,2) = (ymax + ymin) * invHeight;
+
+        // Depth row.
+        proj(2,2) = -(zFar + zNear) * invDepth;
+        proj(2,3) = -(zFar * twoNear) * invDepth;
+
+        // W row.
+        proj(3,2) = -1.0f;
+
+        return proj;
+    }
+
+    // Get inverse frustum matrix.
+    // Closed-form inverse of the matrix produced by the six-argument
+    // frustum() for the same arguments.
+    inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        const float invTwoNear = 1.0f / (2.0f * zNear);
+        const float invTwoNearFar = 1.0f / (2.0f * zNear * zFar);
+
+        Matrix inv(0.0f);
+
+        // X row.
+        inv(0,0) = (xmax - xmin) * invTwoNear;
+        inv(0,3) = (xmax + xmin) * invTwoNear;
+
+        // Y row.
+        inv(1,1) = (ymax - ymin) * invTwoNear;
+        inv(1,3) = (ymax + ymin) * invTwoNear;
+
+        // Z row.
+        inv(2,3) = -1;
+
+        // W row.
+        inv(3,2) = -(zFar - zNear) * invTwoNearFar;
+        inv(3,3) = (zFar + zNear) * invTwoNearFar;
+
+        return inv;
+    }
+
+    // Get infinite frustum matrix.
+    // Same X/Y terms as the six-argument frustum(), but with no zFar
+    // parameter: the depth row is fixed at -1 and -2 * zNear, each
+    // multiplied by a nudge factor that is currently 1.0.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
+    {
+        const float twoNear = 2.0f * zNear;
+        const float invWidth = 1.0f / (xmax - xmin);
+        const float invHeight = 1.0f / (ymax - ymin);
+        const float nudge = 1.0; // 0.999;
+
+        Matrix proj(0.0f);
+
+        // X row.
+        proj(0,0) = twoNear * invWidth;
+        proj(0,2) = (xmax + xmin) * invWidth;
+
+        // Y row.
+        proj(1,1) = twoNear * invHeight;
+        proj(1,2) = (ymax + ymin) * invHeight;
+
+        // Depth row.
+        proj(2,2) = -1.0f * nudge;
+        proj(2,3) = -twoNear * nudge;
+
+        // W row.
+        proj(3,2) = -1.0f;
+
+        return proj;
+    }
+
+    // Get perspective matrix.
+    // Builds a symmetric view volume and forwards it to frustum().
+    // NOTE(review): despite the parameter name, tan(fovy / 2) sets the X
+    // half-extent here and the Y half-extent is derived by dividing by
+    // aspect - confirm this is the intended meaning of fovy and aspect.
+    inline Matrix perspective(float fovy, float aspect, float zNear, float zFar)
+    {
+        const float halfWidth = zNear * tan(fovy / 2);
+        const float halfHeight = halfWidth / aspect;
+
+        return frustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, zFar);
+    }
+
+    // Get inverse perspective matrix.
+    // Derives the same symmetric view volume as the four-argument
+    // perspective() and forwards it to frustumInverse().
+    inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
+    {
+        const float halfWidth = zNear * tan(fovy / 2);
+        const float halfHeight = halfWidth / aspect;
+
+        return frustumInverse(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear, zFar);
+    }
+
+    // Get infinite perspective matrix.
+    // Symmetric view volume forwarded to the five-argument (no zFar)
+    // frustum() overload.
+    inline Matrix perspective(float fovy, float aspect, float zNear)
+    {
+        const float halfWidth = zNear * tan(fovy / 2);
+        const float halfHeight = halfWidth / aspect;
+        return frustum(-halfWidth, halfWidth, -halfHeight, halfHeight, zNear);
+    }
+
+    // Get matrix determinant.
+    // Fully expanded 4x4 determinant: the sum of 24 signed products, each
+    // of four elements, indexed directly into the flat m_data array.
+    // The matrix itself is not modified.
+    inline float Matrix::determinant() const
+    {
+        return 
+            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
+            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
+            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
+            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
+            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
+            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
+    }
+
+    // Get the transpose of m: element (row, col) of m becomes element
+    // (col, row) of the result. All 16 entries of the result are written.
+    inline Matrix transpose(Matrix::Arg m)
+    {
+        Matrix flipped;
+        for (int row = 0; row < 4; row++)
+        {
+            for (int col = 0; col < 4; col++)
+            {
+                flipped(col, row) = m(row, col);
+            }
+        }
+        return flipped;
+    }
+
+    // Inverse using Cramer's rule.
+    // Each of the 16 result entries is written as a sum of six signed
+    // triple products of m's elements (indexed through the flat data()
+    // accessor); the whole result is then multiplied by 1 / determinant(m).
+    // There is no guard against a zero determinant, so a singular m ends up
+    // dividing by zero in the final scale step.
+    inline Matrix inverseCramer(Matrix::Arg m)
+    {
+        Matrix r;
+        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
+        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
+        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
+        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
+        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
+        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
+        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
+        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
+        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
+        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
+        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
+        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
+        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
+        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
+        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
+        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
+        // Divide the cofactor sums by the determinant to finish the inverse.
+        r.scale(1.0f / m.determinant());
+        return r;
+    }
+
+    // Inverse of a matrix assumed to be an isometry (rotation plus
+    // translation): the upper-left 3x3 block is transposed and the negated
+    // translation (m.data(12..14)) is then applied through translate().
+    // Nothing checks that m actually is an isometry.
+    inline Matrix isometryInverse(Matrix::Arg m)
+    {
+        Matrix inv(identity);
+
+        // Transposed upper-left 3x3 block.
+        for (int row = 0; row < 3; row++)
+        {
+            for (int col = 0; col < 3; col++)
+            {
+                inv(col, row) = m(row, col);
+            }
+        }
+
+        // Undo the original translation.
+        const Vector3 offset(m.data(12), m.data(13), m.data(14));
+        inv.translate(-offset);
+
+        return inv;
+    }
+
+    // Transform the given 3d point with the given matrix.
+    // The point is treated as having w = 1, so the last column of m is
+    // added in; the fourth row of m is ignored (no perspective divide).
+    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
+    {
+        const float x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3);
+        const float y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3);
+        const float z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3);
+        return Vector3(x, y, z);
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    // Only the upper-left 3x3 block of m is used, so the last column
+    // (translation) has no effect.
+    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
+    {
+        const float x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2);
+        const float y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2);
+        const float z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2);
+        return Vector3(x, y, z);
+    }
+
+    // Transform the given 4d vector with the given matrix.
+    // Full 4x4 product m * p; no perspective divide is performed.
+    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
+    {
+        const float x = p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3);
+        const float y = p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3);
+        const float z = p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3);
+        const float w = p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3);
+        return Vector4(x, y, z, w);
+    }
+
+    // Matrix product helper: copies a and calls apply(b) on the copy.
+    // Given how Matrix::apply combines its argument, the result is a * b
+    // in the (row, col) convention of operator().
+    // @@ Upstream left the operand order as an open question.
+    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
+    {
+        Matrix product(a);
+        product.apply(b);
+        return product;
+    }
+
+    // Element-wise in-place addition of m to this matrix.
+    inline void Matrix::operator+=(const Matrix & m)
+    {
+        int i = 0;
+        while (i < 16) {
+            m_data[i] += m.m_data[i];
+            i++;
+        }
+    }
+
+    // Element-wise in-place subtraction of m from this matrix.
+    inline void Matrix::operator-=(const Matrix & m)
+    {
+        int i = 0;
+        while (i < 16) {
+            m_data[i] -= m.m_data[i];
+            i++;
+        }
+    }
+
+    // Element-wise sum of two matrices, built from a copy of a and the
+    // in-place operator+=.
+    inline Matrix operator+(const Matrix & a, const Matrix & b)
+    {
+        Matrix sum(a);
+        sum += b;
+        return sum;
+    }
+
+    // Element-wise difference a - b, built from a copy of a and the
+    // in-place operator-=.
+    inline Matrix operator-(const Matrix & a, const Matrix & b)
+    {
+        Matrix difference(a);
+        difference -= b;
+        return difference;
+    }
+
+
+} // nv namespace
+
+
+#if 0 // old code.
+/** @name Special matrices. */
+//@{
+/** Generate a translation matrix. */
+void TranslationMatrix(const Vec3 & v) {
+    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
+    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
+    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
+    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
+}
+
+/** Rotate theta radians (theta is passed straight to cos/sin) around the axis (v0, v1, v2). */
+void RotationMatrix( float theta, float v0, float v1, float v2 ) {
+    float cost = cos(theta);
+    float sint = sin(theta);
+
+    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    } 
+    else {
+        //we need scale a,b,c to unit length.
+        float a2, b2, c2;
+        a2 = v0 * v0;
+        b2 = v1 * v1;
+        c2 = v2 * v2;
+
+        float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+        v0 *= iscale;
+        v1 *= iscale;
+        v2 *= iscale;
+
+        float abm, acm, bcm;
+        float mcos, asin, bsin, csin;
+        mcos = 1.0f - cost;
+        abm = v0 * v1 * mcos;
+        acm = v0 * v2 * mcos;
+        bcm = v1 * v2 * mcos;
+        asin = v0 * sint;
+        bsin = v1 * sint;
+        csin = v2 * sint;
+        data[0] = a2 * mcos + cost;
+        data[1] = abm - csin;
+        data[2] = acm + bsin;
+        data[3] = abm + csin;
+        data[4] = 0.0f;
+        data[5] = b2 * mcos + cost;
+        data[6] = bcm - asin;
+        data[7] = acm - bsin;
+        data[8] = 0.0f;
+        data[9] = bcm + asin;
+        data[10] = c2 * mcos + cost;
+        data[11] = 0.0f;
+        data[12] = 0.0f;
+        data[13] = 0.0f;
+        data[14] = 0.0f;
+        data[15] = 1.0f;
+    }
+}
+
+/*
+void SkewMatrix(float angle, const Vec3 & v1, const Vec3 & v2) {
+v1.Normalize();
+v2.Normalize();
+
+Vec3 v3;
+v3.Cross(v1, v2);
+v3.Normalize();
+
+// Get skew factor.
+float costheta = Vec3DotProduct(v1, v2);
+float sintheta = Real.Sqrt(1 - costheta * costheta);
+float skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
+
+// Build orthonormal matrix.
+v1 = FXVector3.Cross(v3, v2);
+v1.Normalize();
+
+Matrix R = Matrix::Identity;
+R[0, 0] = v3.X;	// Not sure this is in the correct order...
+R[1, 0] = v3.Y;
+R[2, 0] = v3.Z;
+R[0, 1] = v1.X;
+R[1, 1] = v1.Y;
+R[2, 1] = v1.Z;
+R[0, 2] = v2.X;
+R[1, 2] = v2.Y;
+R[2, 2] = v2.Z;
+
+// Build skew matrix.
+Matrix S = Matrix::Identity;
+S[2, 1] = -skew;
+
+// Return skew transform.
+return R * S * R.Transpose;	// Not sure this is in the correct order...
+}
+*/
+
+/**
+* Generate rotation matrix for the euler angles. This is the same as computing
+* 3 rotation matrices and multiplying them together in our custom order.
+*
+* @todo Have to recompute this code for our new convention.
+**/
+void RotationMatrix( float yaw, float pitch, float roll ) {
+    float sy = sin(yaw+ToRadian(90));
+    float cy = cos(yaw+ToRadian(90));
+    float sp = sin(pitch-ToRadian(90));
+    float cp = cos(pitch-ToRadian(90));
+    float sr = sin(roll);
+    float cr = cos(roll);
+
+    data[0] = cr*cy + sr*sp*sy;
+    data[1] = cp*sy;
+    data[2] = -sr*cy + cr*sp*sy;
+    data[3] = 0;
+
+    data[4] = -cr*sy + sr*sp*cy;
+    data[5] = cp*cy;
+    data[6] = sr*sy + cr*sp*cy;
+    data[7] = 0;
+
+    data[8] = sr*cp;
+    data[9] = -sp;
+    data[10] = cr*cp;
+    data[11] = 0;
+
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = 0;
+    data[15] = 1;
+}
+
+/** Create a frustum matrix with the far plane at zFar. */
+void Frustum( float xmin, float xmax, float ymin, float ymax, float zNear, float zFar ) {
+    float one_deltax, one_deltay, one_deltaz, doubleznear;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    one_deltaz = 1.0f / (zFar - zNear);
+
+    data[0] = (float)(doubleznear * one_deltax);
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+    data[4] = 0.0f;
+    data[5] = (float)(doubleznear * one_deltay);
+    data[6] = 0.f;
+    data[7] = 0.f;
+    data[8] = (float)((xmax + xmin) * one_deltax);
+    data[9] = (float)((ymax + ymin) * one_deltay);
+    data[10] = (float)(-(zFar + zNear) * one_deltaz);
+    data[11] = -1.f;
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = (float)(-(zFar * doubleznear) * one_deltaz);
+    data[15] = 0.f;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void FrustumInf( float xmin, float xmax, float ymin, float ymax, float zNear ) {
+    float one_deltax, one_deltay, doubleznear, nudge;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    nudge = 1.0; // 0.999;
+
+    data[0] = doubleznear * one_deltax;
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+
+    data[4] = 0.0f;
+    data[5] = doubleznear * one_deltay;
+    data[6] = 0.f;
+    data[7] = 0.f;
+
+    data[8] = (xmax + xmin) * one_deltax;
+    data[9] = (ymax + ymin) * one_deltay;
+    data[10] = -1.0f * nudge;
+    data[11] = -1.0f;
+
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = -doubleznear * nudge;
+    data[15] = 0.f;
+}
+
+/** Create an inverse frustum matrix with the far plane at the infinity. */
+void FrustumInfInv( float left, float right, float bottom, float top, float zNear ) {
+    // this matrix is wrong (not really tested); I think it should be transposed.
+    data[0] = (right - left) / (2 * zNear);
+    data[1] = 0;
+    data[2] = 0;
+    data[3] = (right + left) / (2 * zNear);
+    data[4] = 0;
+    data[5] = (top - bottom) / (2 * zNear);
+    data[6] = 0;
+    data[7] = (top + bottom) / (2 * zNear);
+    data[8] = 0;
+    data[9] = 0;
+    data[10] = 0;
+    data[11] = -1;
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = -1 / (2 * zNear);
+    data[15] = 1 / (2 * zNear);
+}
+
+/** Create an homogeneous projection matrix. */
+void Perspective( float fov, float aspect, float zNear, float zFar ) {
+    float xmin, xmax, ymin, ymax;
+
+    xmax = zNear * tan( fov/2 );
+    xmin = -xmax;
+
+    ymax = xmax / aspect;
+    ymin = -ymax;
+
+    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+/** Create a projection matrix with the far plane at the infinity. */
+void PerspectiveInf( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInf( -x, x, -y, y, zNear );
+}
+
+/** Create an inverse projection matrix with far plane at the infinity. */
+void PerspectiveInfInv( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInfInv( -x, x, -y, y, zNear );
+}
+
+/** Build bone matrix from quaternion and offset. */
+void BoneMatrix(const Quat & q, const Vec3 & offset) {
+    float x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
+
+    // calculate coefficients
+    x2 = q.x + q.x;
+    y2 = q.y + q.y;
+    z2 = q.z + q.z;
+
+    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
+    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
+    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
+
+    data[0] = 1.0f - (yy + zz); 	
+    data[1] = xy - wz;
+    data[2] = xz + wy;		
+    data[3] = 0.0f;
+
+    data[4] = xy + wz;		
+    data[5] = 1.0f - (xx + zz);
+    data[6] = yz - wx;		
+    data[7] = 0.0f;
+
+    data[8] = xz - wy;		
+    data[9] = yz + wx;
+    data[10] = 1.0f - (xx + yy);		
+    data[11] = 0.0f;
+
+    data[12] = offset.x;
+    data[13] = offset.y;
+    data[14] = offset.z;			
+    data[15] = 1.0f;
+}
+
+//@}
+
+
+/** @name Transformations: */
+//@{
+
+/** Apply a general scale. */
+void Scale( float x, float y, float z ) {
+    data[0] *= x;	data[4] *= y;	data[8]  *= z;
+    data[1] *= x;	data[5] *= y;	data[9]  *= z;
+    data[2] *= x;	data[6] *= y;	data[10] *= z;
+    data[3] *= x;	data[7] *= y;	data[11] *= z;
+}
+
+/** Apply a rotation of theta radians around the axis v. */
+void Rotate( float theta, const Vec3 & v ) {
+    Matrix b;
+    b.RotationMatrix( theta, v[0], v[1], v[2] );
+    Multiply4x3( b );
+}
+
+/** Apply a rotation of theta radians around the axis (v0, v1, v2). */
+void Rotate( float theta, float v0, float v1, float v2 ) {
+    Matrix b;
+    b.RotationMatrix( theta, v0, v1, v2 );
+    Multiply4x3( b );
+}
+
+/**
+* Translate the matrix by t. This is the same as multiplying by a
+* translation matrix with the given offset.
+* this = T * this
+*/
+void Translate( const Vec3 &t ) {
+    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
+    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
+    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
+    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
+}
+
+/** 
+* Translate the matrix by x, y, z. This is the same as multiplying by a 
+* translation matrix with the given offsets.
+*/
+void Translate( float x, float y, float z ) {
+    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
+    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
+    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
+    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
+}
+
+/** Compute the transposed matrix. */
+void Transpose() {
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+    piSwap(data[3], data[12]);
+    piSwap(data[7], data[13]);
+    piSwap(data[11], data[14]);
+}
+
+/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
+void IsometryInverse() {
+    // transposed 3x3 upper left matrix
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+
+    // translate by the negative offsets
+    Vec3 v(-data[12], -data[13], -data[14]);
+    data[12] = data[13] = data[14] = 0;
+    Translate(v);
+}
+
+/** Compute the inverse of the affine portion of this matrix. */
+void AffineInverse() {
+    data[12] = data[13] = data[14] = 0;
+    Transpose();
+}
+//@}
+
+/** @name Matrix operations: */
+//@{
+
+/** Return the determinant of this matrix. */
+float Determinant() const {
+    return	data[0] * data[5] * data[10] * data[15] + 
+        data[1] * data[6] * data[11] * data[12] +
+        data[2] * data[7] * data[ 8] * data[13] +
+        data[3] * data[4] * data[ 9] * data[14] -
+        data[3] * data[6] * data[ 9] * data[12] -
+        data[2] * data[5] * data[ 8] * data[15] -
+        data[1] * data[4] * data[11] * data[14] -
+        data[0] * data[7] * data[10] * data[12];
+}
+
+
+/** Standard matrix product: this *= B. */
+void Multiply4x4( const Matrix & restrict B ) {
+    Multiply4x4(*this, B);
+}
+
+/** Standard matrix product: this = A * B. this != B*/
+void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 4; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+
+    /* Unrolled but does not allow this == A
+    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
+    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
+    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
+    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
+    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
+    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
+    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
+    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
+    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
+    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
+    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
+    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
+    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
+    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
+    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
+    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
+    */
+}
+
+/** Standard matrix product: this *= B. */
+void Multiply4x3( const Matrix & restrict B ) {
+    Multiply4x3(*this, B);
+}
+
+/** Standard product of matrices, where the last row is [0 0 0 1]. */
+void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 3; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;
+
+    /* Unrolled but does not allow this == A
+    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
+    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
+    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
+    data[3] = 0.0f;
+    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
+    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
+    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
+    data[7] = 0.0f;
+    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
+    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
+    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
+    data[11]= 0.0f;
+    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
+    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
+    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
+    data[15]= 1.0f;
+    */
+}
+//@}
+
+
+/** @name Vector operations: */
+//@{
+
+/** Transform 3d vector (w=0). */
+void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
+}
+/** Transform 3d vector by the transpose (w=0). */
+void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
+    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
+    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
+}
+
+/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */
+void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+}
+
+/** Transform a point, normalize it, and return w. */
+float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    float w;
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);
+    *dest *= w;
+    return w;
+}
+
+/** Transform a point and return w. */
+float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+
+/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */
+void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+//@}
+
+/** @name Matrix analysis. */
+//@{
+
+/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */
+void GetEulerAnglesZYZ(float * s, float * t, float * r) const {
+    if( GetElem(2,2) < 1.0f ) {
+        if( GetElem(2,2) > -1.0f ) {
+            // 	cs*ct*cr-ss*sr 		-ss*ct*cr-cs*sr		st*cr
+            //	cs*ct*sr+ss*cr		-ss*ct*sr+cs*cr		st*sr
+            //	-cs*st				ss*st				ct
+            *s = atan2(GetElem(1,2), -GetElem(0,2));
+            *t = acos(GetElem(2,2));
+            *r = atan2(GetElem(2,1), GetElem(2,0));		
+        }
+        else {
+            // 	-c(s-r)	 	s(s-r)		0
+            //	s(s-r)		c(s-r)		0
+            //	0			0			-1
+            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
+            *t = PI;
+            *r = 0;
+        }
+    }
+    else {
+        // 	c(s+r)		-s(s+r)		0
+        //	s(s+r)		c(s+r)		0
+        //	0			0			1
+        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
+        *t = 0;
+        *r = 0;
+    }
+}
+
+//@}
+
+MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
+
+/** Print to debug output. */
+void Print() const {
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
+}
+
+
+public:
+
+    float data[16];
+
+};
+#endif
+
+
+#endif // NV_MATH_MATRIX_INL
diff --git a/thirdparty/thekla_atlas/nvmath/Morton.h b/thirdparty/thekla_atlas/nvmath/Morton.h
new file mode 100644
index 0000000000..10e0d8152a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Morton.h
@@ -0,0 +1,83 @@
+#pragma once
+// Morton (Z-order) code encode/decode helpers. Code from ryg:
+// http://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+
+
+// "Insert" a 0 bit after each of the 16 low bits of x
+inline uint32 part1By1(uint32 x)
+{
+	x &= 0x0000ffff;                  // x = ---- ---- ---- ---- fedc ba98 7654 3210
+	x = (x ^ (x <<  8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+	x = (x ^ (x <<  4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+	x = (x ^ (x <<  2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+	x = (x ^ (x <<  1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	return x;
+}
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline uint32 part1By2(uint32 x)
+{
+	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
+	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x <<  8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x <<  4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x <<  2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	return x;
+}
+
+inline uint32 encodeMorton2(uint32 x, uint32 y)
+{
+	return (part1By1(y) << 1) + part1By1(x);
+}
+
+inline uint32 encodeMorton3(uint32 x, uint32 y, uint32 z)
+{
+	return (part1By2(z) << 2) + (part1By2(y) << 1) + part1By2(x);
+}
+
+// Inverse of part1By1 - "delete" all odd-indexed bits
+inline uint32 compact1By1(uint32 x)
+{
+	x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
+	x = (x ^ (x >>  1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
+	x = (x ^ (x >>  2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
+	x = (x ^ (x >>  4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
+	x = (x ^ (x >>  8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
+	return x;
+}
+
+// Inverse of part1By2 - "delete" all bits not at positions divisible by 3
+inline uint32 compact1By2(uint32 x)
+{
+	x &= 0x09249249;                  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	x = (x ^ (x >>  2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x >>  4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x >>  8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
+	return x;
+}
+
+inline uint32 decodeMorton2X(uint32 code)
+{
+	return compact1By1(code >> 0);
+}
+
+inline uint32 decodeMorton2Y(uint32 code)
+{
+	return compact1By1(code >> 1);
+}
+
+inline uint32 decodeMorton3X(uint32 code)
+{
+	return compact1By2(code >> 0);
+}
+
+inline uint32 decodeMorton3Y(uint32 code)
+{
+	return compact1By2(code >> 1);
+}
+
+inline uint32 decodeMorton3Z(uint32 code)
+{
+	return compact1By2(code >> 2);
+}
+\ No newline at end of file
diff --git a/thirdparty/thekla_atlas/nvmath/Plane.cpp b/thirdparty/thekla_atlas/nvmath/Plane.cpp
new file mode 100644
index 0000000000..8b54f829ad
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Plane.cpp
@@ -0,0 +1,27 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Plane.h"
+#include "Plane.inl"
+#include "Matrix.inl"
+
+namespace nv
+{
+    Plane transformPlane(const Matrix & m, const Plane & p)
+    {   // Transform plane p by m: move the normal and one point on the plane, then rebuild the plane from both.
+        Vector3 newVec = transformVector(m, p.vector()); // NOTE(review): normal is transformed as a plain direction - presumably only valid without non-uniform scale; confirm
+
+        Vector3 ptInPlane = -p.offset() * p.vector(); // planes satisfy dot(n, x) + w == 0 (see distance()), so -w * n lies on the plane for a unit normal
+        ptInPlane = transformPoint(m, ptInPlane);
+
+        return Plane(newVec, ptInPlane);
+    }
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c)
+    {   // Common point of three planes dot(n_i, x) + w_i == 0 (Cramer's rule): x = -(wa*(b x c) + wb*(c x a) + wc*(a x b)) / dot(a, b x c). Not finite when the normals are coplanar.
+        return (-1.0f / dot(a.vector(), cross(b.vector(), c.vector()))) * (
+            a.offset() * cross(b.vector(), c.vector()) + 
+            c.offset() * cross(a.vector(), b.vector()) +
+            b.offset() * cross(c.vector(), a.vector()));
+    }
+
+} // nv namespace
diff --git a/thirdparty/thekla_atlas/nvmath/Plane.h b/thirdparty/thekla_atlas/nvmath/Plane.h
new file mode 100644
index 0000000000..dc468b28e2
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Plane.h
@@ -0,0 +1,42 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_PLANE_H
+#define NV_MATH_PLANE_H
+
+#include "nvmath.h"
+#include "Vector.h"
+
+namespace nv
+{
+    class Matrix;
+
+    class NVMATH_CLASS Plane
+    {
+    public:
+        Plane();
+        Plane(float x, float y, float z, float w);
+        Plane(const Vector4 & v);
+        Plane(const Vector3 & v, float d);
+        Plane(const Vector3 & normal, const Vector3 & point);
+        Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2);
+
+        const Plane & operator=(const Plane & v);
+
+        Vector3 vector() const;
+        float offset() const;
+        Vector3 normal() const;
+
+        void operator*=(float s);
+
+        Vector4 v;
+    };
+
+    Plane transformPlane(const Matrix &, const Plane &);
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c);
+
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_H
diff --git a/thirdparty/thekla_atlas/nvmath/Plane.inl b/thirdparty/thekla_atlas/nvmath/Plane.inl
new file mode 100644
index 0000000000..2277e38cd5
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Plane.inl
@@ -0,0 +1,50 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_PLANE_INL
+#define NV_MATH_PLANE_INL
+
+#include "Plane.h"
+#include "Vector.inl"
+
+namespace nv
+{
+    inline Plane::Plane() {}
+    inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {}
+    inline Plane::Plane(const Vector4 & v) : v(v) {}
+    inline Plane::Plane(const Vector3 & v, float d) : v(v, d) {}
+    inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {}
+    inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) {
+        Vector3 n = cross(v1-v0, v2-v0);
+        float d = -dot(n, v0);
+        v = Vector4(n, d);
+    }
+
+    inline const Plane & Plane::operator=(const Plane & p) { v = p.v; return *this; }
+
+    inline Vector3 Plane::vector() const { return v.xyz(); }
+    inline float Plane::offset() const { return v.w; }
+    inline Vector3 Plane::normal() const { return normalize(vector(), 0.0f); }
+
+    // Normalize plane.
+    inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
+    {
+        const float len = length(plane.vector());
+        const float inv = isZero(len, epsilon) ? 0 : 1.0f / len;
+        return Plane(plane.v * inv);
+    }
+
+    // Get the signed distance from the given point to this plane.
+    inline float distance(const Plane & plane, const Vector3 & point)
+    {
+        return dot(plane.vector(), point) + plane.offset();
+    }
+
+    inline void Plane::operator*=(float s)
+    {
+        v *= s;
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_INL
diff --git a/thirdparty/thekla_atlas/nvmath/ProximityGrid.cpp b/thirdparty/thekla_atlas/nvmath/ProximityGrid.cpp
new file mode 100644
index 0000000000..3553e48f64
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/ProximityGrid.cpp
@@ -0,0 +1,158 @@
+#include "ProximityGrid.h"
+
+#include "Box.inl"
+#include "Morton.h"
+
+
+using namespace nv;
+
+ProximityGrid::ProximityGrid() {
+}
+
+void ProximityGrid::reset() {
+    cellArray.clear();
+}
+
+void ProximityGrid::init(const Array<Vector3> & pointArray) {
+
+	// Compute bounding box.
+    Box box;
+	box.clearBounds();
+	
+    const uint count = pointArray.count();
+
+    for (uint i = 0; i < count; i++) {
+		box.addPointToBounds(pointArray[i]);
+	}
+
+    init(box, count);
+
+	// Insert all points.
+	for (uint i = 0; i < count; i++) {
+        add(pointArray[i], i);
+    }
+}
+
+
+void ProximityGrid::init(const Box & box, uint count) {
+    reset();
+    // Size the (now empty) grid so that it covers 'box' with cubic cells of side cellWidth.
+    // Determine grid size: aim for roughly one cell per expected point ('count').
+    float cellWidth;
+
+    Vector3 diagonal = box.extents() * 2.f;
+    float volume = box.volume();
+
+    if (equal(volume, 0)) {
+        // Degenerate box, treat like a quad.
+        Vector2 quad;
+        if (diagonal.x < diagonal.y && diagonal.x < diagonal.z) {
+            quad.x = diagonal.y;
+            quad.y = diagonal.z;
+        }
+        else if (diagonal.y < diagonal.x && diagonal.y < diagonal.z) {
+            quad.x = diagonal.x;
+            quad.y = diagonal.z;
+        }
+        else {
+            quad.x = diagonal.x;
+            quad.y = diagonal.y;
+        }
+
+        float cellArea = quad.x * quad.y / count;
+        cellWidth = sqrtf(cellArea); // pow(cellArea, 1.0f / 2.0f);
+    }
+    else {
+        // Ideally we want one cell per point.
+        float cellVolume = volume / count;
+        cellWidth = pow(cellVolume, 1.0f / 3.0f);
+    }
+
+    nvDebugCheck(cellWidth != 0); // the guards below keep a zero/NaN cell width out of the float->int conversion
+
+    sx = (cellWidth > 0) ? max(1, ftoi_ceil(diagonal.x / cellWidth)) : 1; // fall back to one cell per axis
+    sy = (cellWidth > 0) ? max(1, ftoi_ceil(diagonal.y / cellWidth)) : 1;
+    sz = (cellWidth > 0) ? max(1, ftoi_ceil(diagonal.z / cellWidth)) : 1;
+
+    invCellSize.x = (diagonal.x > 0) ? float(sx) / diagonal.x : 0.0f; // flat axis: scale 0 maps every coordinate to cell 0 instead of producing inf/NaN
+    invCellSize.y = (diagonal.y > 0) ? float(sy) / diagonal.y : 0.0f;
+    invCellSize.z = (diagonal.z > 0) ? float(sz) / diagonal.z : 0.0f;
+
+	cellArray.resize(sx * sy * sz);
+
+    corner = box.minCorner; // @@ Align grid better?
+}
+
+// Gather all points inside the given sphere.
+// Radius is assumed to be small, so we don't bother culling the cells.
+void ProximityGrid::gather(const Vector3 & position, float radius, Array<uint> & indexArray) {
+    int x0 = index_x(position.x - radius);
+    int x1 = index_x(position.x + radius);
+
+    int y0 = index_y(position.y - radius);
+    int y1 = index_y(position.y + radius);
+
+    int z0 = index_z(position.z - radius);
+    int z1 = index_z(position.z + radius);
+
+    for (int z = z0; z <= z1; z++) {
+        for (int y = y0; y <= y1; y++) {
+            for (int x = x0; x <= x1; x++) {
+                int idx = index(x, y, z);
+                indexArray.append(cellArray[idx].indexArray);
+            }
+        }
+    }
+}
+
+
+uint32 ProximityGrid::mortonCount() const {
+    uint64 s = U64(max3(sx, sy, sz));
+    s = nextPowerOfTwo(s);
+    
+    if (s > 1024) {
+        return U32(s * s * min3(sx, sy, sz));
+    }
+
+    return U32(s * s * s);
+}
+
+int ProximityGrid::mortonIndex(uint32 code) const {
+    uint32 x, y, z;
+
+    uint s = U32(max3(sx, sy, sz));
+    if (s > 1024) {
+        // Use layered two-dimensional morton order.
+        s = nextPowerOfTwo(s);
+        uint layer = code / (s * s);
+        code = code % (s * s);
+
+        uint layer_count = U32(min3(sx, sy, sz));
+        if (sx == layer_count) {
+            x = layer;
+            y = decodeMorton2X(code);
+            z = decodeMorton2Y(code);
+        }
+        else if (sy == layer_count) {
+            x = decodeMorton2Y(code); 
+            y = layer; 
+            z = decodeMorton2X(code);
+        }
+        else /*if (sz == layer_count)*/ {
+            x = decodeMorton2X(code);
+            y = decodeMorton2Y(code);
+            z = layer;
+        }
+    }
+    else {
+        x = decodeMorton3X(code);
+        y = decodeMorton3Y(code);
+        z = decodeMorton3Z(code);
+    }
+
+    if (x >= U32(sx) || y >= U32(sy) || z >= U32(sz)) {
+        return -1;
+    }
+
+    return index(x, y, z);
+}
diff --git a/thirdparty/thekla_atlas/nvmath/ProximityGrid.h b/thirdparty/thekla_atlas/nvmath/ProximityGrid.h
new file mode 100644
index 0000000000..a21bb3bd68
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/ProximityGrid.h
@@ -0,0 +1,99 @@
+#pragma once
+#ifndef NV_MATH_PROXIMITYGRID_H
+#define NV_MATH_PROXIMITYGRID_H
+
+#include "Vector.h"
+#include "ftoi.h"
+
+#include "nvcore/Array.inl"
+
+
+// A simple, dynamic proximity grid based on Jon's code.
+// Instead of storing pointers here I store indices.
+
+namespace nv {
+
+    class Box;
+
+    struct Cell {
+        Array<uint> indexArray;
+    };
+
+    struct ProximityGrid {
+        ProximityGrid();
+
+        void reset();
+        void init(const Array<Vector3> & pointArray);
+        void init(const Box & box, uint count);
+
+        int index_x(float x) const;
+        int index_y(float y) const;
+        int index_z(float z) const;
+        int index(int x, int y, int z) const;
+        int index(const Vector3 & pos) const;
+        
+        uint32 mortonCount() const;
+        int mortonIndex(uint32 code) const;
+
+        void add(const Vector3 & pos, uint key);
+        bool remove(const Vector3 & pos, uint key);
+
+        void gather(const Vector3 & pos, float radius, Array<uint> & indices);
+
+        Array<Cell> cellArray;
+
+        Vector3 corner;
+        Vector3 invCellSize;
+        int sx, sy, sz;
+    };
+
+    // For morton traversal, do:
+    // for (int code = 0; code < mortonCount(); code++) {
+    //   int idx = mortonIndex(code);
+    //   if (idx < 0) continue;
+    // }
+
+
+
+    inline int ProximityGrid::index_x(float x) const {
+        return clamp(ftoi_floor((x - corner.x) * invCellSize.x),  0, sx-1);
+    }
+
+    inline int ProximityGrid::index_y(float y) const {
+        return clamp(ftoi_floor((y - corner.y) * invCellSize.y),  0, sy-1);
+    }
+
+    inline int ProximityGrid::index_z(float z) const {
+        return clamp(ftoi_floor((z - corner.z) * invCellSize.z),  0, sz-1);
+    }
+
+    inline int ProximityGrid::index(int x, int y, int z) const {
+        nvDebugCheck(x >= 0 && x < sx);
+        nvDebugCheck(y >= 0 && y < sy);
+        nvDebugCheck(z >= 0 && z < sz);
+        int idx = (z * sy + y) * sx + x;
+        nvDebugCheck(idx >= 0 && uint(idx) < cellArray.count());
+        return idx;
+    }
+
+    inline int ProximityGrid::index(const Vector3 & pos) const {
+        int x = index_x(pos.x);
+        int y = index_y(pos.y);
+        int z = index_z(pos.z);
+        return index(x, y, z);
+    }
+
+
+    inline void ProximityGrid::add(const Vector3 & pos, uint key) {
+        uint idx = index(pos);
+        cellArray[idx].indexArray.append(key);
+    }
+
+    inline bool ProximityGrid::remove(const Vector3 & pos, uint key) {
+        uint idx = index(pos);
+        return cellArray[idx].indexArray.remove(key);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_PROXIMITYGRID_H
diff --git a/thirdparty/thekla_atlas/nvmath/Quaternion.h b/thirdparty/thekla_atlas/nvmath/Quaternion.h
new file mode 100644
index 0000000000..dc5219e5e4
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Quaternion.h
@@ -0,0 +1,213 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MATH_QUATERNION_H
+#define NV_MATH_QUATERNION_H
+
+#include "nvmath/nvmath.h"
+#include "nvmath/Vector.inl" // @@ Do not include inl files from header files.
+#include "nvmath/Matrix.h"
+
+namespace nv
+{
+
+    class NVMATH_CLASS Quaternion
+    {
+    public:
+        typedef Quaternion const & Arg;
+
+        Quaternion();
+        explicit Quaternion(float f);
+        Quaternion(float x, float y, float z, float w);
+        Quaternion(Vector4::Arg v);
+
+        const Quaternion & operator=(Quaternion::Arg v);
+
+        Vector4 asVector() const;
+
+        union {
+            struct {
+                float x, y, z, w;
+            };
+            float component[4];
+        };
+    };
+
+    inline Quaternion::Quaternion() {}
+    inline Quaternion::Quaternion(float f) : x(f), y(f), z(f), w(f) {}
+    inline Quaternion::Quaternion(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Quaternion::Quaternion(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    // @@ Move all these to Quaternion.inl!
+
+    inline const Quaternion & Quaternion::operator=(Quaternion::Arg v) { 
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector4 Quaternion::asVector() const { return Vector4(x, y, z, w); }
+
+    inline Quaternion mul(Quaternion::Arg a, Quaternion::Arg b)
+    {
+        return Quaternion(
+            + a.x*b.w + a.y*b.z - a.z*b.y + a.w*b.x,
+            - a.x*b.z + a.y*b.w + a.z*b.x + a.w*b.y,
+            + a.x*b.y - a.y*b.x + a.z*b.w + a.w*b.z,
+            - a.x*b.x - a.y*b.y - a.z*b.z + a.w*b.w);
+    }
+
+    inline Quaternion mul(Quaternion::Arg a, Vector3::Arg b)
+    {
+        return Quaternion(
+            + a.y*b.z - a.z*b.y + a.w*b.x,
+            - a.x*b.z           + a.z*b.x + a.w*b.y,
+            + a.x*b.y - a.y*b.x           + a.w*b.z,
+            - a.x*b.x - a.y*b.y - a.z*b.z );
+    }
+
+    inline Quaternion mul(Vector3::Arg a, Quaternion::Arg b)
+    {
+        return Quaternion(
+            + a.x*b.w + a.y*b.z - a.z*b.y,
+            - a.x*b.z + a.y*b.w + a.z*b.x,
+            + a.x*b.y - a.y*b.x + a.z*b.w,
+            - a.x*b.x - a.y*b.y - a.z*b.z);
+    }
+
+    inline Quaternion operator *(Quaternion::Arg a, Quaternion::Arg b)
+    {
+        return mul(a, b);
+    }
+
+    inline Quaternion operator *(Quaternion::Arg a, Vector3::Arg b)
+    {
+        return mul(a, b);
+    }
+
+    inline Quaternion operator *(Vector3::Arg a, Quaternion::Arg b)
+    {
+        return mul(a, b);
+    }
+
+
+    inline Quaternion scale(Quaternion::Arg q, float s)
+    {
+        return scale(q.asVector(), s);
+    }
+    inline Quaternion operator *(Quaternion::Arg q, float s)
+    {
+        return scale(q, s);
+    }
+    inline Quaternion operator *(float s, Quaternion::Arg q)
+    {
+        return scale(q, s);
+    }
+
+    inline Quaternion scale(Quaternion::Arg q, Vector4::Arg s)
+    {
+        return scale(q.asVector(), s);
+    }
+    /*inline Quaternion operator *(Quaternion::Arg q, Vector4::Arg s)
+    {
+    return scale(q, s);
+    }
+    inline Quaternion operator *(Vector4::Arg s, Quaternion::Arg q)
+    {
+    return scale(q, s);
+    }*/
+
+    inline Quaternion conjugate(Quaternion::Arg q)
+    {
+        return scale(q, Vector4(-1, -1, -1, 1));
+    }
+
+    inline float length(Quaternion::Arg q)
+    {
+        return length(q.asVector());
+    }
+
+    inline bool isNormalized(Quaternion::Arg q, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(q), 1, epsilon);
+    }
+
+    inline Quaternion normalize(Quaternion::Arg q, float epsilon = NV_EPSILON)
+    {
+        float l = length(q);
+        nvDebugCheck(!isZero(l, epsilon));
+        Quaternion n = scale(q, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Quaternion inverse(Quaternion::Arg q)
+    {
+        return conjugate(normalize(q));
+    }
+
+    /// Create a rotation quaternion for @a angle alpha around normal vector @a v.
+    inline Quaternion axisAngle(Vector3::Arg v, float alpha)
+    {
+        float s = sinf(alpha * 0.5f);
+        float c = cosf(alpha * 0.5f);
+        return Quaternion(Vector4(v * s, c));
+    }
+
+    inline Vector3 imag(Quaternion::Arg q)
+    {
+        return q.asVector().xyz();
+    }
+
+    inline float real(Quaternion::Arg q)
+    {
+        return q.w;
+    }
+
+
+    /// Transform vector.
+    inline Vector3 transform(Quaternion::Arg q, Vector3::Arg v)
+    {
+        //Quaternion t = q * v * conjugate(q);
+        //return imag(t);
+
+        // Faster method by Fabian Giesen and others:
+        // http://molecularmusings.wordpress.com/2013/05/24/a-faster-quaternion-vector-multiplication/
+        // http://mollyrocket.com/forums/viewtopic.php?t=833&sid=3a84e00a70ccb046cfc87ac39881a3d0
+        
+        Vector3 t = 2 * cross(imag(q), v);
+        return v + q.w * t + cross(imag(q), t);
+    }
+
+    // Convert a rotation matrix to a unit quaternion; each branch picks the largest component, so t >= 1 for a proper rotation. @@ Still not unit-tested.
+    // From Insomniac's Mike Day (NOTE(review): antisymmetric-term signs follow the article; confirm they match Matrix::operator()(row, col), else the result is the conjugate):
+    // http://www.insomniacgames.com/converting-a-rotation-matrix-to-a-quaternion/
+    inline Quaternion fromMatrix(const Matrix & m) {
+        if (m(2, 2) < 0) {
+            if (m(0, 0) > m(1,1)) {
+                float t = 1 + m(0, 0) - m(1, 1) - m(2, 2); // t == 4*x*x
+                return scale(Quaternion(t, m(0,1)+m(1,0), m(2,0)+m(0,2), m(1,2)-m(2,1)), 0.5f / sqrtf(t));
+            }
+            else {
+                float t = 1 - m(0, 0) + m(1, 1) - m(2, 2); // t == 4*y*y
+                return scale(Quaternion(m(0,1) + m(1,0), t, m(1,2) + m(2,1), m(2,0) - m(0,2)), 0.5f / sqrtf(t));
+            }
+        }
+        else {
+            if (m(0, 0) < -m(1, 1)) {
+                float t = 1 - m(0, 0) - m(1, 1) + m(2, 2); // t == 4*z*z
+                return scale(Quaternion(m(2,0) + m(0,2), m(1,2) + m(2,1), t, m(0,1) - m(1,0)), 0.5f / sqrtf(t));
+            }
+            else {
+                float t = 1 + m(0, 0) + m(1, 1) + m(2, 2); // t == 4*w*w
+                return scale(Quaternion(m(1,2) - m(2,1), m(2,0) - m(0,2), m(0,1) - m(1,0), t), 0.5f / sqrtf(t));
+            }
+        }
+    }
+
+
+} // nv namespace
+
+#endif // NV_MATH_QUATERNION_H
diff --git a/thirdparty/thekla_atlas/nvmath/Random.cpp b/thirdparty/thekla_atlas/nvmath/Random.cpp
new file mode 100644
index 0000000000..1a60e7f5e7
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Random.cpp
@@ -0,0 +1,54 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvmath/Random.h>
+#include <time.h>
+
+using namespace nv;
+
+// Statics
+const uint16 Rand48::a0 = 0xE66D; 
+const uint16 Rand48::a1 = 0xDEEC; 
+const uint16 Rand48::a2 = 0x0005;
+const uint16 Rand48::c0 = 0x000B;
+
+
+/// Get a random seed based on the current time.
+uint Rand::randomSeed()
+{
+    return (uint)time(NULL);
+}
+
+
+void MTRand::initialize( uint32 seed )
+{
+    // Initialize generator state with seed
+    // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier.
+    // In previous versions, most significant bits (MSBs) of the seed affect
+    // only MSBs of the state array.  Modified 9 Jan 2002 by Makoto Matsumoto.
+    uint32 *s = state;
+    uint32 *r = state;
+    int i = 1;
+    *s++ = seed & 0xffffffffUL;
+    for( ; i < N; ++i )
+    {
+        *s++ = ( 1812433253UL * ( *r ^ (*r >> 30) ) + i ) & 0xffffffffUL;
+        r++;
+    }
+}
+
+
+void MTRand::reload()
+{
+    // Generate N new values in state
+    // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com)
+    uint32 *p = state;
+    int i;
+    for( i = N - M; i--; ++p )
+        *p = twist( p[M], p[0], p[1] );
+    for( i = M; --i; ++p )
+        *p = twist( p[M-N], p[0], p[1] );
+    *p = twist( p[M-N], p[0], state[0] );
+
+    left = N, next = state;
+}
+
diff --git a/thirdparty/thekla_atlas/nvmath/Random.h b/thirdparty/thekla_atlas/nvmath/Random.h
new file mode 100644
index 0000000000..223292706a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Random.h
@@ -0,0 +1,376 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_RANDOM_H
+#define NV_MATH_RANDOM_H
+
+#include "nvmath.h"
+#include "nvcore/Utils.h" // nextPowerOfTwo
+
+
+namespace nv
+{
+
+    /// Interface of the random number generators.
+    class Rand
+    {
+    public:
+
+        virtual ~Rand() {}
+
+        enum time_e { Time };
+
+        /// Provide a new seed.
+        virtual void seed( uint s ) { /* empty */ };
+
+        /// Get an integer random number.
+        virtual uint get() = 0;
+
+        /// Get a random number on the [0, max] interval (both ends included).
+        uint getRange( uint max )
+        {
+            if (max == 0) return 0;
+            if (max == NV_UINT32_MAX) return get();
+
+            const uint np2 = nextPowerOfTwo( max+1 ); // max == NV_UINT32_MAX returned above, so max+1 does not wrap a 32-bit uint here
+            const uint mask = np2 - 1;
+            uint n;
+            do { n = get() & mask; } while( n > max ); // mask get() down, retry until the value is <= max
+            return n;
+        }
+
+        /// Random number on [0.0, 1.0) interval (23 random mantissa bits, so 1.0 itself is never returned).
+        float getFloat()
+        {
+            union
+            {
+                uint32 i;
+                float f;
+            } pun;
+
+            pun.i = 0x3f800000UL | (get() & 0x007fffffUL);
+            return pun.f - 1.0f;
+        }
+
+        float getFloatRange(float min, float max) {
+            return getFloat() * (max - min) + min;
+        }
+
+        /*
+        /// Random number on [0.0, 1.0] interval.
+        double getReal()
+        {
+        return double(get()) * (1.0/4294967295.0); // 2^32-1
+        }
+
+        /// Random number on [0.0, 1.0) interval.
+        double getRealExclusive()
+        {
+        return double(get()) * (1.0/4294967296.0); // 2^32
+        }
+        */
+
+        /// Get the max value of the random number.
+        uint max() const { return NV_UINT32_MAX; }
+
+        // Get a random seed.
+        static uint randomSeed();
+
+    };
+
+
+    /// Very simple random number generator with low storage requirements.
+    class SimpleRand : public Rand
+    {
+    public:
+
+        /// Constructor that uses the current time as the seed.
+        SimpleRand( time_e )
+        {
+            seed(randomSeed());
+        }
+
+        /// Constructor that uses the given seed.
+        SimpleRand( uint s = 0 )
+        {
+            seed(s);
+        }
+
+        /// Set the given seed.
+        virtual void seed( uint s )
+        {
+            current = s;
+        }
+
+        /// Get a random number.
+        virtual uint get()
+        {
+            return current = current * 1103515245 + 12345;
+        }
+
+    private:
+
+        uint current;
+
+    };
+
+
+    /// Mersenne twister random number generator.
+    class MTRand : public Rand
+    {
+    public:
+
+        enum { N = 624 };       // length of state vector
+        enum { M = 397 };
+
+        /// Constructor that uses the current time as the seed.
+        MTRand( time_e )
+        {
+            seed(randomSeed());
+        }
+
+        /// Constructor that uses the given seed.
+        MTRand( uint s = 0 )
+        {
+            seed(s);
+        }
+
+        /// Constructor that uses the given seeds.
+        NVMATH_API MTRand( const uint * seed_array, uint length );
+
+
+        /// Provide a new seed.
+        virtual void seed( uint s )
+        {
+            initialize(s);
+            reload();
+        }	
+
+        /// Get a 32-bit random number.
+        virtual uint get()
+        {
+            // Pull a 32-bit integer from the generator state
+            // Every other access function simply transforms the numbers extracted here
+            if( left == 0 ) { 
+                reload(); 
+            }
+            left--;
+
+            uint s1;
+            s1 = *next++;
+            s1 ^= (s1 >> 11);
+            s1 ^= (s1 <<  7) & 0x9d2c5680U;
+            s1 ^= (s1 << 15) & 0xefc60000U;
+            return ( s1 ^ (s1 >> 18) );		
+        };
+
+
+    private:
+
+        NVMATH_API void initialize( uint32 seed );
+        NVMATH_API void reload();
+
+        uint hiBit( uint u ) const { return u & 0x80000000U; }
+        uint loBit( uint u ) const { return u & 0x00000001U; }
+        uint loBits( uint u ) const { return u & 0x7fffffffU; }
+        uint mixBits( uint u, uint v ) const { return hiBit(u) | loBits(v); }
+        uint twist( uint m, uint s0, uint s1 ) const { return m ^ (mixBits(s0,s1)>>1) ^ ((~loBit(s1)+1) & 0x9908b0dfU); }
+
+    private:
+
+        uint state[N];	// internal state
+        uint * next;	// next value to get from state
+        int left;		// number of values left before reload needed		
+
+    };
+
+
+
+    /** George Marsaglia's random number generator. 
+    * Code based on Thatcher Ulrich public domain source code:
+    * http://cvs.sourceforge.net/viewcvs.py/tu-testbed/tu-testbed/base/tu_random.cpp?rev=1.7&view=auto
+    *
+    * PRNG code adapted from the complementary-multiply-with-carry
+    * code in the article: George Marsaglia, "Seeds for Random Number
+    * Generators", Communications of the ACM, May 2003, Vol 46 No 5,
+    * pp90-93.
+    * 
+    * The article says:
+    * 
+    * "Any one of the choices for seed table size and multiplier will
+    * provide a RNG that has passed extensive tests of randomness,
+    * particularly those in [3], yet is simple and fast --
+    * approximately 30 million random 32-bit integers per second on a
+    * 850MHz PC.  The period is a*b^n, where a is the multiplier, n
+    * the size of the seed table and b=2^32-1.  (a is chosen so that
+    * b is a primitive root of the prime a*b^n + 1.)"
+    * 
+    * [3] Marsaglia, G., Zaman, A., and Tsang, W.  Toward a universal
+    * random number generator.  _Statistics and Probability Letters
+    * 8_ (1990), 35-39.
+    */
+    class GMRand : public Rand
+    {
+    public:
+
+        enum { SEED_COUNT = 8 };
+
+        //	const uint64 a = 123471786;		// for SEED_COUNT=1024
+        //	const uint64 a = 123554632;		// for SEED_COUNT=512
+        //	const uint64 a = 8001634;		// for SEED_COUNT=255
+        //	const uint64 a = 8007626;		// for SEED_COUNT=128
+        //	const uint64 a = 647535442;		// for SEED_COUNT=64
+        //	const uint64 a = 547416522;		// for SEED_COUNT=32
+        //	const uint64 a = 487198574;		// for SEED_COUNT=16
+        //	const uint64 a = 716514398U;	// for SEED_COUNT=8
+        enum { a = 716514398U };
+
+
+        GMRand( time_e )
+        {
+            seed(randomSeed());
+        }
+
+        GMRand(uint s = 987654321)
+        {
+            seed(s);
+        }
+
+
+        /// Provide a new seed.
+        virtual void seed( uint s )
+        {
+            c = 362436;
+            i = SEED_COUNT - 1;
+
+            for(int i = 0; i < SEED_COUNT; i++) {
+                s = s ^ (s << 13);
+                s = s ^ (s >> 17);
+                s = s ^ (s << 5);
+                Q[i] = s;
+            }
+        }
+
+        /// Get a 32-bit random number (one multiply-with-carry step over the seed table Q).
+        virtual uint get()
+        {
+            const uint32 r = 0xFFFFFFFE;		
+
+            uint64 t;
+            uint32 x;
+
+            i = (i + 1) & (SEED_COUNT - 1);
+            t = uint64(a) * Q[i] + c; // widen first: 'a' is a 32-bit enum constant, and the new carry is the high 32 bits of the product
+            c = uint32(t >> 32);
+            x = uint32(t + c);
+
+            if( x < c ) {
+                x++;
+                c++;
+            }
+
+            uint32  val = r - x;
+            Q[i] = val;
+            return val;
+        };
+
+
+    private:
+
+        uint32 c;
+        uint32 i;
+        uint32 Q[8];
+
+    };
+
+
+    /** Random number implementation from the GNU Sci. Lib. (GSL).
+    * Adapted from Nicholas Chapman version:
+    * 
+    * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
+    * This is the Unix rand48() generator. The generator returns the
+    * upper 32 bits from each term of the sequence,
+    * 
+    * x_{n+1} = (a x_n + c) mod m 
+    * 
+    * using 48-bit unsigned arithmetic, with a = 0x5DEECE66D , c = 0xB
+    * and m = 2^48. The seed specifies the upper 32 bits of the initial
+    * value, x_1, with the lower 16 bits set to 0x330E.
+    * 
+    * The theoretical value of x_{10001} is 244131582646046.
+    * 
+    * The period of this generator is ? FIXME (probably around 2^48). 
+    */
+    class Rand48 : public Rand
+    {
+    public:
+
+        Rand48( time_e )
+        {
+            seed(randomSeed());
+        }
+
+        Rand48( uint s = 0x1234ABCD )
+        {
+            seed(s);
+        }	
+
+
+        /** Set the given seed. */
+        virtual void seed( uint s ) {
+            vstate.x0 = 0x330E;
+            vstate.x1 = uint16(s & 0xFFFF);
+            vstate.x2 = uint16((s >> 16) & 0xFFFF);
+        }
+
+        /** Get a random number. */
+        virtual uint get() {
+
+            advance();
+
+            uint x1 = vstate.x1;
+            uint x2 = vstate.x2;
+            return (x2 << 16) + x1;
+        }
+
+
+    private:
+
+        void advance()
+        {
+            /* work with unsigned long ints throughout to get correct integer
+            promotions of any unsigned short ints */
+            const uint32 x0 = vstate.x0;
+            const uint32 x1 = vstate.x1;
+            const uint32 x2 = vstate.x2;
+
+            uint32 a;
+            a = a0 * x0 + c0;
+
+            vstate.x0 = uint16(a & 0xFFFF);
+            a >>= 16;
+
+            /* although the next line may overflow we only need the top 16 bits
+            in the following stage, so it does not matter */
+
+            a += a0 * x1 + a1 * x0; 
+            vstate.x1 = uint16(a & 0xFFFF);
+
+            a >>= 16;
+            a += a0 * x2 + a1 * x1 + a2 * x0;
+            vstate.x2 = uint16(a & 0xFFFF);
+        }
+
+
+    private:	
+        NVMATH_API static const uint16 a0, a1, a2, c0;
+
+        struct rand48_state_t { 
+            uint16 x0, x1, x2; 
+        } vstate;
+
+    };
+
+} // nv namespace
+
+#endif // NV_MATH_RANDOM_H
diff --git a/thirdparty/thekla_atlas/nvmath/Solver.cpp b/thirdparty/thekla_atlas/nvmath/Solver.cpp
new file mode 100644
index 0000000000..191793ee29
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Solver.cpp
@@ -0,0 +1,744 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Solver.h"
+#include "Sparse.h"
+
+#include "nvcore/Array.inl"
+
+using namespace nv;
+
+namespace
+{
+    class Preconditioner // Abstract interface used by the preconditioned conjugate gradient solver in this file.
+    {
+    public:
+        // Virtual dtor, so implementations can be destroyed through this interface.
+        virtual ~Preconditioner() { }
+        // Apply preconditioning step: x is the input vector, y receives the result.
+        // Pure virtual; each concrete preconditioner supplies the operation.
+        virtual void apply(const FullVector & x, FullVector & y) const = 0;
+    };
+
+
+    // Jacobi preconditioner: one scale factor per diagonal entry of M.
+    class JacobiPreconditioner : public Preconditioner
+    {
+    public:
+
+        // Builds the factors from the diagonal of the square matrix M. With
+        // symmetric == true a factor is 1/sqrt(|d|), otherwise 1/d. A zero
+        // diagonal (possible with zero area triangles) gets the factor 1.
+        JacobiPreconditioner(const SparseMatrix & M, bool symmetric) : m_inverseDiagonal(M.width())
+        {
+            nvCheck(M.isSquare());
+
+            const uint dim = M.width();
+            for (uint i = 0; i < dim; i++)
+            {
+                const float diag = M.getCoefficient(i, i);
+
+                float factor = 1.0f;
+                if (diag != 0)
+                {
+                    factor = symmetric ? 1.0f / sqrtf(fabsf(diag)) : 1.0f / diag;
+                }
+
+                m_inverseDiagonal[i] = factor;
+            }
+        }
+
+        // Component-wise product: y[k] = factor[k] * x[k].
+        void apply(const FullVector & x, FullVector & y) const
+        {
+            nvDebugCheck(x.dimension() == m_inverseDiagonal.dimension());
+            nvDebugCheck(y.dimension() == m_inverseDiagonal.dimension());
+
+            // @@ Wrap vector component-wise product into a separate function.
+            for (uint k = 0, count = x.dimension(); k < count; k++)
+            {
+                y[k] = m_inverseDiagonal[k] * x[k];
+            }
+        }
+
+    private:
+        FullVector m_inverseDiagonal;    // Per-row scale factors used by apply().
+    };
+
+} // namespace
+
+
+static bool ConjugateGradientSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon); // plain variant, defined below
+static bool ConjugateGradientSolver(const Preconditioner & preconditioner, const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon); // preconditioned variant, defined below
+
+
+// Least squares solve of A*x = b through the normal equations: At*A*x = At*b.
+// x is handed to SymmetricSolver unchanged and receives the result; returns that solver's result.
+bool nv::LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon/*1e-5f*/)
+{
+    nvDebugCheck(A.width() == x.dimension());
+    nvDebugCheck(A.height() == b.dimension());
+    nvDebugCheck(A.height() >= A.width()); // @@ If height == width we could solve it directly...
+
+    const uint rowCount = A.height();
+    const uint columnCount = A.width();
+
+    // At = transpose of A: columnCount rows, rowCount columns.
+    SparseMatrix At(rowCount, columnCount);
+    transpose(A, At);
+    // Left hand side of the normal equations: At*A.
+    SparseMatrix AtA(columnCount);
+    mult(At, A, AtA);
+    // Right hand side of the normal equations: At*b.
+    FullVector Atb(columnCount);
+    mult(At, b, Atb);
+    return SymmetricSolver(AtA, Atb, x, epsilon);
+}
+
+
+// Tells whether 'index' is one of the lockedCount entries of lockedParameters.
+static bool isLockedParameter(const uint * lockedParameters, uint lockedCount, uint index)
+{
+    bool locked = false;
+    for (uint l = 0; l < lockedCount; l++)
+    {
+        locked |= (lockedParameters[l] == index);
+    }
+    return locked;
+}
+
+
+// Least squares solve of A*x = b where the parameters listed in lockedParameters
+// keep the value they have in x on entry; only the remaining entries of x are
+// solved for and overwritten. Returns the result of the reduced solve.
+// See section 10.4.3 in: Mesh Parameterization: Theory and Practice, Siggraph Course Notes, August 2007
+bool nv::LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, const uint * lockedParameters, uint lockedCount, float epsilon/*= 1e-5f*/)
+{
+    nvDebugCheck(A.width() == x.dimension());
+    nvDebugCheck(A.height() == b.dimension());
+    nvDebugCheck(A.height() >= A.width() - lockedCount);
+
+    // @@ This is not the most efficient way of building a system with reduced degrees of freedom. It would be faster to do it on the fly.
+
+    const uint rowCount = A.height();
+    const uint columnCount = A.width();
+    const uint D = columnCount - lockedCount;
+    nvDebugCheck(D > 0);
+
+    // Compute: b - Al * xl
+    // (the locked columns times their known values move to the right hand side).
+    FullVector b_Alxl(b);
+
+    for (uint y = 0; y < rowCount; y++)
+    {
+        const uint count = A.getRow(y).count();
+        for (uint e = 0; e < count; e++)
+        {
+            const uint column = A.getRow(y)[e].x;
+            if (isLockedParameter(lockedParameters, lockedCount, column))
+            {
+                b_Alxl[y] -= x[column] * A.getRow(y)[e].v;
+            }
+        }
+    }
+
+    // Remove locked columns from A.
+    SparseMatrix Af(D, rowCount);
+
+    for (uint y = 0; y < rowCount; y++)
+    {
+        const uint count = A.getRow(y).count();
+        for (uint e = 0; e < count; e++)
+        {
+            const uint column = A.getRow(y)[e].x;
+            if (isLockedParameter(lockedParameters, lockedCount, column))
+            {
+                continue;
+            }
+
+            // Free column: its new index drops by one for every locked column before it.
+            uint ix = column;
+            for (uint i = 0; i < lockedCount; i++)
+            {
+                if (column > lockedParameters[i]) ix--; // shift columns
+            }
+
+            Af.setCoefficient(ix, y, A.getRow(y)[e].v);
+        }
+    }
+
+    // Remove elements from x
+    // (the free entries, in order, are the initial guess of the reduced system).
+    FullVector xf(D);
+
+    for (uint i = 0, j = 0; i < columnCount; i++)
+    {
+        if (!isLockedParameter(lockedParameters, lockedCount, i))
+        {
+            xf[j++] = x[i];
+        }
+    }
+
+    // Solve reduced system.
+    const bool result = LeastSquaresSolver(Af, b_Alxl, xf, epsilon);
+
+    // Copy results back to x; locked entries are left untouched.
+    for (uint i = 0, j = 0; i < columnCount; i++)
+    {
+        if (!isLockedParameter(lockedParameters, lockedCount, i))
+        {
+            x[i] = xf[j++];
+        }
+    }
+
+    return result;
+}
+
+
+// Solves A*x = b with the Jacobi-preconditioned conjugate gradient in this file. x is the
+// initial guess on entry and the result on exit. Returns what the CG solver returns.
+bool nv::SymmetricSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon/*1e-5f*/)
+{
+    nvDebugCheck(A.height() == A.width());
+    nvDebugCheck(A.height() == b.dimension());
+    nvDebugCheck(b.dimension() == x.dimension());
+
+    const JacobiPreconditioner preconditioner(A, true);   // true selects the symmetric scaling
+    return ConjugateGradientSolver(preconditioner, A, b, x, epsilon);
+}
+
+
+/**
+* Compute the solution of the sparse linear system A*x = b using the Conjugate
+* Gradient method.
+*
+* Solving sparse linear systems:
+* (1)		A*x = b
+* 
+* The conjugate gradient algorithm solves (1) only in the case that A is 
+* symmetric and positive definite. It is based on the idea of minimizing the 
+* function
+* 
+* (2)		f(x) = 1/2*x*A*x - b*x
+* 
+* This function is minimized when its gradient
+* 
+* (3)		df = A*x - b
+* 
+* is zero, which is equivalent to (1). The minimization is carried out by 
+* generating a succession of search directions p.k and improved minimizers x.k. 
+* At each stage a quantity alpha.k is found that minimizes f(x.k + alpha.k*p.k), 
+* and x.k+1 is set equal to the new point x.k + alpha.k*p.k. The p.k and x.k are 
+* built up in such a way that x.k+1 is also the minimizer of f over the whole
+* vector space of directions already taken, {p.1, p.2, . . . , p.k}. After N 
+* iterations you arrive at the minimizer over the entire vector space, i.e., the 
+* solution to (1).
+*
+* For a really good explanation of the method see:
+*
+* "An Introduction to the Conjugate Gradient Method Without the Agonizing Pain",
+* Jonathan Richard Shewchuk.
+*
+**/
+// x is the initial guess on entry and the solution estimate on exit. Runs at most
+// 4*D iterations and stops early once dot(r, r) <= epsilon^2 * (initial dot(r, r)).
+// Returns whether that stopping test is met on exit.
+/*static*/ bool ConjugateGradientSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon)
+{
+    nvDebugCheck( A.isSquare() );
+    nvDebugCheck( A.width() == b.dimension() );
+    nvDebugCheck( A.width() == x.dimension() );
+
+    const int D = A.width();
+    const int i_max = 4 * D;   // Convergence should be linear, but in some cases, it's not.
+
+    // Work vectors, all of dimension D.
+    FullVector r(D);   // residual
+    FullVector p(D);   // search direction
+    FullVector q(D);   // A*p
+
+    // r = b - A*x;
+    copy(b, r);
+    sgemv(-1, A, x, 1, r);
+
+    // p = r;
+    copy(r, p);
+
+    // delta_new tracks dot(r, r); delta_0 is its starting value.
+    float delta_new = dot( r, r );
+    const float delta_0 = delta_new;
+
+    // 'step' counts iterations from 1, matching the recompute test below.
+    for (int step = 1; step <= i_max && delta_new > epsilon*epsilon*delta_0; step++)
+    {
+        // q = A*p
+        mult(A, p, q);
+
+        // alpha = delta_new / p*q
+        const float alpha = delta_new / dot( p, q );
+
+        // x = alpha*p + x
+        saxpy(alpha, p, x);
+
+        if ((step & 31) != 0)
+        {
+            // r = r - alpha*q
+            saxpy(-alpha, q, r);
+        }
+        else // recompute r after 32 steps
+        {
+            // (presumably to shed the rounding error accumulated by the incremental update)
+            // r = b - A*x
+            copy(b, r);
+            sgemv(-1, A, x, 1, r);
+        }
+
+        const float delta_old = delta_new;
+        delta_new = dot( r, r );
+
+        const float beta = delta_new / delta_old;
+
+        // p = beta*p + r
+        scal(beta, p);
+        saxpy(1, r, p);
+    }
+
+    // True when the stopping test holds, false when the iteration limit ended the loop.
+    return delta_new <= epsilon*epsilon*delta_0;
+}
+
+
+// Conjugate gradient with preconditioner.
+// Solves A*x = b starting from the x passed in. The residual is run through
+// preconditioner.apply() before it enters the dot products and the search
+// direction. Runs at most 4*D iterations and stops early once dot(r, s) has
+// dropped to epsilon^2 times its initial value.
+// Returns whether that stopping test is met on exit.
+/*static*/ bool ConjugateGradientSolver(const Preconditioner & preconditioner, const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon)
+{
+    nvDebugCheck( A.isSquare() );
+    nvDebugCheck( A.width() == b.dimension() );
+    nvDebugCheck( A.width() == x.dimension() );
+
+    const int D = A.width();
+    const int i_max = 4 * D;   // Convergence should be linear, but in some cases, it's not.
+
+    // Work vectors, all of dimension D.
+    FullVector r(D);    // residual
+    FullVector p(D);    // search direction
+    FullVector q(D);    // A*p
+    FullVector s(D);    // preconditioned residual
+
+    // r = b - A*x
+    copy(b, r);
+    sgemv(-1, A, x, 1, r);
+
+
+    // p = M^-1 * r
+    preconditioner.apply(r, p);
+
+
+    // delta_new tracks dot(residual, preconditioned residual); delta_0 is its
+    // starting value, the reference for the relative stopping test.
+    float delta_new = dot(r, p);
+    const float delta_0 = delta_new;
+
+    // 'step' counts iterations from 1, matching the recompute test below.
+    for (int step = 1; step <= i_max && delta_new > epsilon*epsilon*delta_0; step++)
+    {
+        // q = A*p
+        mult(A, p, q);
+
+        // alpha = delta_new / p*q
+        const float alpha = delta_new / dot(p, q);
+
+        // x = alpha*p + x
+        saxpy(alpha, p, x);
+
+        if ((step & 31) != 0)
+        {
+            // r = r - alpha*q
+            saxpy(-alpha, q, r);
+        }
+        else  // recompute r after 32 steps
+        {
+            // r = b - A*x
+            copy(b, r);
+            sgemv(-1, A, x, 1, r);
+        }
+
+        // s = M^-1 * r
+        preconditioner.apply(r, s);
+
+        const float delta_old = delta_new;
+        delta_new = dot( r, s );
+
+        const float beta = delta_new / delta_old;
+
+        // p = s + beta*p
+        scal(beta, p);
+        saxpy(1, s, p);
+    }
+
+    // True when the stopping test holds, false when the iteration limit ended the loop.
+    return delta_new <= epsilon*epsilon*delta_0;
+}
+
+
+#if 0 // Nonsymmetric solvers
+
+/** Bi-conjugate gradient method. Returns 0 if x already meets the tolerance, -i if rho_1 became 0 at iteration i, else the iteration count reached. */
+MATHLIB_API int BiConjugateGradientSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, float epsilon ) {
+    piDebugCheck( A.IsSquare() );
+    piDebugCheck( A.Width() == b.Dim() );
+    piDebugCheck( A.Width() == x.Dim() );
+    // NOTE(review): compiled out by the enclosing #if 0; piDebugCheck/DenseVector are not declared in the visible part of this file.
+    int i = 0;
+    const int D = A.Width();
+    const int i_max = 4 * D;
+
+    float resid;
+    float rho_1 = 0;
+    float rho_2 = 0;
+    float alpha;
+    float beta;
+
+    DenseVector r(D);
+    DenseVector rtilde(D);
+    DenseVector p(D);
+    DenseVector ptilde(D);
+    DenseVector q(D);
+    DenseVector qtilde(D);
+    DenseVector tmp(D);	// temporal vector.
+
+    // r = b - A*x;
+    A.Product( x, tmp );
+    r.Sub( b, tmp );
+
+    // rtilde = r
+    rtilde.Set( r );
+
+    // p = r;
+    p.Set( r );
+
+    // ptilde = rtilde
+    ptilde.Set( rtilde );
+
+    // Residuals are measured relative to the norm of b; a zero norm is
+    // replaced by 1 so the division below stays defined.
+    float normb = b.Norm();
+    if( normb == 0.0 ) normb = 1;
+
+    // test convergence
+    resid = r.Norm() / normb;
+    if( resid < epsilon ) {
+        // method converges?
+        return 0;
+    }
+
+
+    while( i < i_max ) {
+
+        i++;
+
+        rho_1 = DenseVectorDotProduct( r, rtilde );
+
+        if( rho_1 == 0 ) {
+            // method fails.
+            return -i;
+        }
+
+        if (i == 1) {
+            p.Set( r );
+            ptilde.Set( rtilde );
+        } 
+        else {
+            beta = rho_1 / rho_2;
+
+            // p = r + beta * p;
+            p.Mad( r, p, beta );
+
+            // ptilde = rtilde + beta * ptilde;
+            ptilde.Mad( rtilde, ptilde, beta );
+        }
+
+        // q = A * p;
+        A.Product( p, q );
+
+        // qtilde = A^t * ptilde;
+        A.TransProduct( ptilde, qtilde );
+
+        alpha = rho_1 / DenseVectorDotProduct( ptilde, q );
+
+        // x += alpha * p;
+        x.Mad( x, p, alpha );
+
+        // r -= alpha * q;
+        r.Mad( r, q, -alpha );
+
+        // rtilde -= alpha * qtilde;
+        rtilde.Mad( rtilde, qtilde, -alpha );
+
+        rho_2 = rho_1;
+
+        // test convergence
+        resid = r.Norm() / normb;
+        if( resid < epsilon ) {
+            // method converges
+            return i;
+        }
+    }
+    // Iteration limit reached without meeting the tolerance; i equals i_max here.
+    return i;
+}
+
+
+/** Bi-conjugate gradient stabilized method, without preconditioning. Returns 0 if x already meets the tolerance, -i on breakdown at iteration i, else the iteration count reached. */
+int BiCGSTABSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, float epsilon ) {
+    piDebugCheck( A.IsSquare() );
+    piDebugCheck( A.Width() == b.Dim() );
+    piDebugCheck( A.Width() == x.Dim() );
+    // NOTE(review): compiled out by the enclosing #if 0; piDebugCheck/DenseVector are not declared in the visible part of this file.
+    int i = 0;
+    const int D = A.Width();
+    const int i_max = 2 * D;
+
+
+    float resid;
+    float rho_1 = 0;
+    float rho_2 = 0;
+    float alpha = 0;
+    float beta = 0;
+    float omega = 0;
+
+    DenseVector p(D);
+    DenseVector phat(D);
+    DenseVector s(D);
+    DenseVector shat(D);
+    DenseVector t(D);
+    DenseVector v(D);
+
+    DenseVector r(D);
+    DenseVector rtilde(D);
+
+    DenseVector tmp(D);
+
+    // r = b - A*x;
+    A.Product( x, tmp );
+    r.Sub( b, tmp );
+
+    // rtilde = r
+    rtilde.Set( r );
+
+    // Residuals are measured relative to the norm of b; a zero norm is replaced by 1.
+    float normb = b.Norm();
+    if( normb == 0.0 ) normb = 1;
+
+    // test convergence
+    resid = r.Norm() / normb;
+    if( resid < epsilon ) {
+        // method converges?
+        return 0;
+    }
+
+
+    while( i<i_max ) {
+
+        i++;
+
+        rho_1 = DenseVectorDotProduct( rtilde, r );
+        if( rho_1 == 0 ) {
+            // method fails
+            return -i;
+        }
+
+
+        if( i == 1 ) {
+            p.Set( r );
+        }
+        else {
+            beta = (rho_1 / rho_2) * (alpha / omega);
+
+            // p = r + beta * (p - omega * v);
+            p.Mad( p, v, -omega );
+            p.Mad( r, p, beta );
+        }
+
+        //phat = M.solve(p);
+        phat.Set( p );
+        //Precond( &phat, p );
+
+        //v = A * phat;
+        A.Product( phat, v );
+
+        alpha = rho_1 / DenseVectorDotProduct( rtilde, v );
+
+        // s = r - alpha * v;
+        s.Mad( r, v, -alpha );
+
+        // Early exit: if the intermediate residual s is already small, only the alpha step is applied to x.
+        resid = s.Norm() / normb;
+        if( resid < epsilon ) {
+            // x += alpha * phat;
+            x.Mad( x, phat, alpha );
+            return i;
+        }
+
+        //shat = M.solve(s);
+        shat.Set( s );
+        //Precond( &shat, s );
+
+        //t = A * shat;
+        A.Product( shat, t );
+
+        omega = DenseVectorDotProduct( t, s ) / DenseVectorDotProduct( t, t );
+
+        // x += alpha * phat + omega * shat;
+        x.Mad( x, shat, omega );
+        x.Mad( x, phat, alpha );
+
+        //r = s - omega * t;
+        r.Mad( s, t, -omega );
+
+        rho_2 = rho_1;
+
+        resid = r.Norm() / normb;
+        if( resid < epsilon ) {
+            return i;
+        }
+
+        if( omega == 0 ) {
+            return -i;	// ???
+        }
+    }
+    // Iteration limit reached without meeting the tolerance; i equals i_max here.
+    return i;
+}
+
+
+/** Bi-conjugate gradient stabilized method using the preconditioner M. Returns 0 if x already meets the tolerance, -i on breakdown at iteration i, else the iteration count reached. */
+int BiCGSTABPrecondSolve( const SparseMatrix &A, const DenseVector &b, DenseVector &x, const IPreconditioner &M, float epsilon ) {
+    piDebugCheck( A.IsSquare() );
+    piDebugCheck( A.Width() == b.Dim() );
+    piDebugCheck( A.Width() == x.Dim() );
+    // NOTE(review): compiled out by the enclosing #if 0; piDebugCheck/DenseVector/IPreconditioner are not declared in the visible part of this file.
+    int i = 0;
+    const int D = A.Width();
+    const int i_max = D;
+    //	const int i_max = 1000;
+
+
+    float resid;
+    float rho_1 = 0;
+    float rho_2 = 0;
+    float alpha = 0;
+    float beta = 0;
+    float omega = 0;
+
+    DenseVector p(D);
+    DenseVector phat(D);
+    DenseVector s(D);
+    DenseVector shat(D);
+    DenseVector t(D);
+    DenseVector v(D);
+
+    DenseVector r(D);
+    DenseVector rtilde(D);
+
+    DenseVector tmp(D);
+
+    // r = b - A*x;
+    A.Product( x, tmp );
+    r.Sub( b, tmp );
+
+    // rtilde = r
+    rtilde.Set( r );
+
+    // Residuals are measured relative to the norm of b; a zero norm is replaced by 1.
+    float normb = b.Norm();
+    if( normb == 0.0 ) normb = 1;
+
+    // test convergence
+    resid = r.Norm() / normb;
+    if( resid < epsilon ) {
+        // method converges?
+        return 0;
+    }
+
+
+    while( i<i_max ) {
+
+        i++;
+
+        rho_1 = DenseVectorDotProduct( rtilde, r );
+        if( rho_1 == 0 ) {
+            // method fails
+            return -i;
+        }
+
+
+        if( i == 1 ) {
+            p.Set( r );
+        }
+        else {
+            beta = (rho_1 / rho_2) * (alpha / omega);
+
+            // p = r + beta * (p - omega * v);
+            p.Mad( p, v, -omega );
+            p.Mad( r, p, beta );
+        }
+
+        //phat = M.solve(p);
+        //phat.Set( p );
+        M.Precond( &phat, p );
+
+        //v = A * phat;
+        A.Product( phat, v );
+
+        alpha = rho_1 / DenseVectorDotProduct( rtilde, v );
+
+        // s = r - alpha * v;
+        s.Mad( r, v, -alpha );
+
+        // Early exit: if the intermediate residual s is already small, only the alpha step is applied to x.
+        resid = s.Norm() / normb;
+
+        //printf( "--- Iteration %d: residual = %f\n", i, resid );
+
+        if( resid < epsilon ) {
+            // x += alpha * phat;
+            x.Mad( x, phat, alpha );
+            return i;
+        }
+
+        //shat = M.solve(s);
+        //shat.Set( s );
+        M.Precond( &shat, s );
+
+        //t = A * shat;
+        A.Product( shat, t );
+
+        omega = DenseVectorDotProduct( t, s ) / DenseVectorDotProduct( t, t );
+
+        // x += alpha * phat + omega * shat;
+        x.Mad( x, shat, omega );
+        x.Mad( x, phat, alpha );
+
+        //r = s - omega * t;
+        r.Mad( s, t, -omega );
+
+        rho_2 = rho_1;
+
+        resid = r.Norm() / normb;
+        if( resid < epsilon ) {
+            return i;
+        }
+
+        if( omega == 0 ) {
+            return -i;	// ???
+        }
+    }
+    // Iteration limit reached without meeting the tolerance; i equals i_max here.
+    return i;
+}
+
+#endif
diff --git a/thirdparty/thekla_atlas/nvmath/Solver.h b/thirdparty/thekla_atlas/nvmath/Solver.h
new file mode 100644
index 0000000000..2bbf92736a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Solver.h
@@ -0,0 +1,24 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_SOLVER_H
+#define NV_MATH_SOLVER_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class SparseMatrix;
+    class FullVector;
+
+    // Each solver takes the system A, the right hand side b and writes its result into x; epsilon defaults to 1e-5f.
+    // Linear solvers.
+    NVMATH_API bool LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon = 1e-5f);
+    NVMATH_API bool LeastSquaresSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, const uint * lockedParameters, uint lockedCount, float epsilon = 1e-5f);
+    NVMATH_API bool SymmetricSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon = 1e-5f);
+    //NVMATH_API void NonSymmetricSolver(const SparseMatrix & A, const FullVector & b, FullVector & x, float epsilon = 1e-5f);
+
+} // nv namespace
+
+
+#endif // NV_MATH_SOLVER_H
diff --git a/thirdparty/thekla_atlas/nvmath/Sparse.cpp b/thirdparty/thekla_atlas/nvmath/Sparse.cpp
new file mode 100644
index 0000000000..421e7ee022
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Sparse.cpp
@@ -0,0 +1,889 @@
+// This code is in the public domain -- Ignacio Castaño <castanyo@yahoo.es>
+
+#include "Sparse.h"
+#include "KahanSum.h"
+
+#include "nvcore/Array.inl"
+
+#define USE_KAHAN_SUM 0
+
+
+using namespace nv;
+
+
+FullVector::FullVector(uint dim) // Creates a vector with 'dim' components.
+{ 
+    m_array.resize(dim); 
+}
+
+FullVector::FullVector(const FullVector & v) : m_array(v.m_array) // Copy constructor: duplicates the components of v.
+{
+}
+
+const FullVector & FullVector::operator=(const FullVector & v)
+{
+    nvCheck(dimension() == v.dimension());
+    // Assignment is only meant for vectors of equal dimension (checked above).
+    m_array = v.m_array;
+
+    return *this;
+}
+
+
+// Sets every component of the vector to f.
+void FullVector::fill(float f)
+{
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] = f;
+    }
+}
+
+// Component-wise addition of v. Both vectors must have the same dimension.
+void FullVector::operator+= (const FullVector & v)
+{
+    nvDebugCheck(dimension() == v.dimension());
+
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] += v.m_array[k];
+    }
+}
+
+// Component-wise subtraction of v. Both vectors must have the same dimension.
+void FullVector::operator-= (const FullVector & v)
+{
+    nvDebugCheck(dimension() == v.dimension());
+
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] -= v.m_array[k];
+    }
+}
+
+// Component-wise product with v. Both vectors must have the same dimension.
+void FullVector::operator*= (const FullVector & v)
+{
+    nvDebugCheck(dimension() == v.dimension());
+
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] *= v.m_array[k];
+    }
+}
+
+// Adds the scalar f to every component.
+void FullVector::operator+= (float f)
+{
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] += f;
+    }
+}
+
+// Subtracts the scalar f from every component.
+void FullVector::operator-= (float f)
+{
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] -= f;
+    }
+}
+
+// Multiplies every component by the scalar f.
+void FullVector::operator*= (float f)
+{
+    for (uint k = 0, count = dimension(); k < count; k++)
+    {
+        m_array[k] *= f;
+    }
+}
+
+
+// y = a*x + y, component by component. x and y must have the same dimension.
+void nv::saxpy(float a, const FullVector & x, FullVector & y)
+{
+    nvDebugCheck(x.dimension() == y.dimension());
+
+    for (uint k = 0, count = x.dimension(); k < count; k++)
+    {
+        y[k] += a * x[k];
+    }
+}
+
+// y = x, component by component. x and y must have the same dimension.
+void nv::copy(const FullVector & x, FullVector & y)
+{
+    nvDebugCheck(x.dimension() == y.dimension());
+
+    for (uint k = 0, count = x.dimension(); k < count; k++)
+    {
+        y[k] = x[k];
+    }
+}
+
+// x = a*x: scales every component of x by a.
+void nv::scal(float a, FullVector & x)
+{
+    for (uint k = 0, count = x.dimension(); k < count; k++)
+    {
+        x[k] *= a;
+    }
+}
+
+// Dot product of two vectors of equal dimension. The KahanSum path is only
+// compiled when USE_KAHAN_SUM is non-zero.
+float nv::dot(const FullVector & x, const FullVector & y)
+{
+    nvDebugCheck(x.dimension() == y.dimension());
+
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint k = 0, count = x.dimension(); k < count; k++)
+    {
+        accumulator.add(x[k] * y[k]);
+    }
+    return accumulator.sum();
+#else
+    float total = 0;
+    for (uint k = 0, count = x.dimension(); k < count; k++)
+    {
+        total += x[k] * y[k];
+    }
+    return total;
+#endif
+}
+
+
+FullMatrix::FullMatrix(uint d) : m_width(d), m_height(d) // Square matrix: d columns, d rows.
+{
+    m_array.resize(d*d, 0.0f); // one float per coefficient; 0.0f is passed as the value for the new elements
+}
+
+FullMatrix::FullMatrix(uint w, uint h) : m_width(w), m_height(h) // Matrix with w columns and h rows.
+{
+    m_array.resize(w*h, 0.0f); // one float per coefficient; 0.0f is passed as the value for the new elements
+}
+
+FullMatrix::FullMatrix(const FullMatrix & m) : m_width(m.m_width), m_height(m.m_height) // Copy constructor.
+{
+    m_array = m.m_array; // copies the coefficient storage of m
+}
+
+const FullMatrix & FullMatrix::operator=(const FullMatrix & m)
+{
+    nvCheck(width() == m.width());
+    nvCheck(height() == m.height());
+    // Only the coefficients are copied; the sizes are required to match already (checked above).
+    m_array = m.m_array;
+
+    return *this;
+}
+
+
+float FullMatrix::getCoefficient(uint x, uint y) const
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+    const uint index = y * width() + x;   // row-major storage: row y, column x
+    return m_array[index];
+}
+
+void FullMatrix::setCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+    const uint index = y * width() + x;   // row-major storage: row y, column x
+    m_array[index] = f;
+}
+
+void FullMatrix::addCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+    const uint index = y * width() + x;   // row-major storage: row y, column x
+    m_array[index] += f;
+}
+
+void FullMatrix::mulCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+    const uint index = y * width() + x;   // row-major storage: row y, column x
+    m_array[index] *= f;
+}
+
+// Dot product of row y with v; v must have width() components.
+float FullMatrix::dotRow(uint y, const FullVector & v) const
+{
+    nvDebugCheck( v.dimension() == width() );
+    nvDebugCheck( y < height() );
+
+    const uint count = v.dimension();
+    const uint rowStart = y * count;
+    float total = 0;
+    for (uint k = 0; k < count; k++)
+    {
+        total += m_array[rowStart + k] * v[k];
+    }
+    return total;
+}
+
+// v += alpha * (row y of this matrix). The row was previously added without the alpha factor.
+void FullMatrix::madRow(uint y, float alpha, FullVector & v) const
+{
+    nvDebugCheck( v.dimension() == width() );
+    nvDebugCheck( y < height() );
+    const uint count = v.dimension();
+    for (uint i = 0; i < count; i++)
+    {
+        v[i] += alpha * m_array[y * count + i];
+    }
+}
+
+
+// y = M * x (forwards to the overload below with NoTransposed).
+void nv::mult(const FullMatrix & M, const FullVector & x, FullVector & y)
+{
+    mult(NoTransposed, M, x, y);
+}
+
+// y = M * x, or y = transpose(M) * x when TM == Transposed.
+void nv::mult(Transpose TM, const FullMatrix & M, const FullVector & x, FullVector & y)
+{
+    const uint w = M.width();
+    const uint h = M.height();
+
+    if (TM != Transposed)
+    {
+        nvDebugCheck( w == x.dimension() );
+        nvDebugCheck( h == y.dimension() );
+
+        // One dot product per row of M.
+        for (uint row = 0; row < h; row++)
+        {
+            y[row] = M.dotRow(row, x);
+        }
+        return;
+    }
+
+    nvDebugCheck( h == x.dimension() );
+    nvDebugCheck( w == y.dimension() );
+    // Transposed product: accumulate x[row] times each row of M into a zeroed y.
+    y.fill(0.0f);
+    for (uint row = 0; row < h; row++)
+    {
+        M.madRow(row, x[row], y);
+    }
+}
+
+// y = alpha*A*x + beta*y (forwards to the overload below with NoTransposed).
+void nv::sgemv(float alpha, const FullMatrix & A, const FullVector & x, float beta, FullVector & y)
+{
+    sgemv(alpha, NoTransposed, A, x, beta, y);
+}
+
+void nv::sgemv(float alpha, Transpose TA, const FullMatrix & A, const FullVector & x, float beta, FullVector & y)
+{
+    const uint w = A.width();
+    const uint h = A.height();
+    // y = alpha*op(A)*x + beta*y, where op(A) is A or its transpose depending on TA.
+    if (TA == Transposed)
+    {
+        nvDebugCheck( h == x.dimension() );
+        nvDebugCheck( w == y.dimension() );
+        // Scale y by beta first: the row accumulation below only adds the alpha*op(A)*x term.
+        y *= beta;
+        for (uint i = 0; i < h; i++)
+        {
+            A.madRow(i, alpha * x[i], y);
+        }
+    }
+    else
+    {
+        nvDebugCheck( w == x.dimension() );
+        nvDebugCheck( h == y.dimension() );
+        for (uint i = 0; i < h; i++)
+        {
+            y[i] = alpha * A.dotRow(i, x) + beta * y[i];
+        }
+    }
+}
+
+
+// Multiply row j of op(A) by column i of op(B), where op() applies the requested transpose.
+static float dot(uint j, Transpose TA, const FullMatrix & A, uint i, Transpose TB, const FullMatrix & B)
+{
+    const uint w = (TA == NoTransposed) ? A.width() : A.height();
+    nvDebugCheck(w == ((TB == NoTransposed) ? B.height() : B.width()));
+    // The transposed-B case must read B (column k, row i); it used to read A by mistake.
+    float sum = 0.0f;
+
+    for (uint k = 0; k < w; k++)
+    {
+        const float a = (TA == NoTransposed) ? A.getCoefficient(k, j) : A.getCoefficient(j, k); // @@ Move branches out of the loop?
+        const float b = (TB == NoTransposed) ? B.getCoefficient(i, k) : B.getCoefficient(k, i);
+        sum += a * b;
+    }
+
+    return sum;
+}
+
+
+// C = A * B (forwards to the overload below without transposing either operand).
+void nv::mult(const FullMatrix & A, const FullMatrix & B, FullMatrix & C)
+{
+    mult(NoTransposed, A, NoTransposed, B, C);
+}
+
+void nv::mult(Transpose TA, const FullMatrix & A, Transpose TB, const FullMatrix & B, FullMatrix & C)
+{
+    sgemm(1.0f, TA, A, TB, B, 0.0f, C); // plain product via sgemm: alpha = 1, beta = 0
+}
+
+// C = alpha*A*B + beta*C (forwards to the overload below without transposing either operand).
+void nv::sgemm(float alpha, const FullMatrix & A, const FullMatrix & B, float beta, FullMatrix & C)
+{
+    sgemm(alpha, NoTransposed, A, NoTransposed, B, beta, C);
+}
+
+void nv::sgemm(float alpha, Transpose TA, const FullMatrix & A, Transpose TB, const FullMatrix & B, float beta, FullMatrix & C)
+{
+    const uint w = C.width();
+    const uint h = C.height();
+    // C = alpha*op(A)*op(B) + beta*C. op(A) has ah rows and aw columns, op(B) has bh rows and bw columns.
+    uint aw = (TA == NoTransposed) ? A.width() : A.height();
+    uint ah = (TA == NoTransposed) ? A.height() : A.width();
+    uint bw = (TB == NoTransposed) ? B.width() : B.height();
+    uint bh = (TB == NoTransposed) ? B.height() : B.width();
+
+    nvDebugCheck(aw == bh);
+    nvDebugCheck(w == bw);
+    nvDebugCheck(h == ah);
+    // No bw == ah requirement: the product does not have to be square.
+    // C(x, y) is row y of op(A) times column x of op(B), so the row index goes first.
+    for (uint y = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++)
+        {
+            float c = alpha * ::dot(y, TA, A, x, TB, B) + beta * C.getCoefficient(x, y);
+            C.setCoefficient(x, y, c);
+        }
+    }
+}
+
+
+
+
+
+/// Ctor. Init the size of the sparse matrix: d columns and d rows.
+SparseMatrix::SparseMatrix(uint d) : m_width(d)
+{
+    m_array.resize(d); // one coefficient list per row
+}
+
+/// Ctor. Init the size of the sparse matrix: w columns and h rows.
+SparseMatrix::SparseMatrix(uint w, uint h) : m_width(w)
+{
+    m_array.resize(h); // one coefficient list per row
+}
+
+SparseMatrix::SparseMatrix(const SparseMatrix & m) : m_width(m.m_width) // Copy constructor.
+{
+    m_array = m.m_array; // copies the per-row coefficient lists of m
+}
+
+const SparseMatrix & SparseMatrix::operator=(const SparseMatrix & m)
+{
+    nvCheck(width() == m.width());
+    nvCheck(height() == m.height());
+    // Only the rows are copied; the sizes are required to match already (checked above).
+    m_array = m.m_array;
+
+    return *this;
+}
+
+
+// x is column, y is row. Linear search of row y; a column that is not stored reads as 0.
+float SparseMatrix::getCoefficient(uint x, uint y) const
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+
+    const Array<Coefficient> & row = m_array[y];
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        if (row[i].x == x) return row[i].v;
+    }
+
+    return 0.0f;
+}
+
+// Sets the coefficient at column x, row y. A stored entry is overwritten (even with 0);
+// a new entry is only created when f is non-zero.
+void SparseMatrix::setCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+
+    Array<Coefficient> & row = m_array[y];
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        if (row[i].x != x) continue;
+        row[i].v = f;
+        return;
+    }
+
+    if (f == 0.0f) return;
+
+    // Not stored yet: append a new entry.
+    Coefficient c = { x, f };
+    row.append( c );
+}
+
+// Adds f to the coefficient at column x, row y, creating the entry if needed.
+// Adding zero does nothing and never creates an entry.
+void SparseMatrix::addCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+
+    if (f == 0.0f) return;
+
+    Array<Coefficient> & row = m_array[y];
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        if (row[i].x != x) continue;
+        row[i].v += f;
+        return;
+    }
+
+    // Not stored yet: the new entry starts at f.
+    Coefficient c = { x, f };
+    row.append( c );
+}
+
+// Multiplies the coefficient at column x, row y by f.
+// A column that is not stored in the row stays absent.
+void SparseMatrix::mulCoefficient(uint x, uint y, float f)
+{
+    nvDebugCheck( x < width() );
+    nvDebugCheck( y < height() );
+
+    const uint count = m_array[y].count();
+    for (uint i = 0; i < count; i++)
+    {
+        if (m_array[y][i].x == x) 
+        {
+            m_array[y][i].v *= f;
+            return;
+        }
+    }
+
+    // An entry that is not stored is an implicit zero, and 0 * f is still 0.
+    // The previous code appended { x, f } here, which turned a zero
+    // coefficient into f instead of leaving it untouched.
+}
+
+
+// Sum of the stored coefficients of row y (KahanSum path only when USE_KAHAN_SUM is non-zero).
+float SparseMatrix::sumRow(uint y) const
+{
+    nvDebugCheck( y < height() );
+
+    const Array<Coefficient> & row = m_array[y];
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        accumulator.add(row[i].v);
+    }
+    return accumulator.sum();
+#else
+    float total = 0;
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        total += row[i].v;
+    }
+    return total;
+#endif
+}
+
+// Dot product of row y with v, visiting only the stored coefficients of the row.
+float SparseMatrix::dotRow(uint y, const FullVector & v) const
+{
+    nvDebugCheck( y < height() );
+
+    const Array<Coefficient> & row = m_array[y];
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        accumulator.add(row[i].v * v[row[i].x]);
+    }
+    return accumulator.sum();
+#else
+    float total = 0;
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        total += row[i].v * v[row[i].x];
+    }
+    return total;
+#endif
+}
+
+// v += alpha * (row y), touching only the columns stored in the row.
+void SparseMatrix::madRow(uint y, float alpha, FullVector & v) const
+{
+    nvDebugCheck(y < height());
+    const Array<Coefficient> & row = m_array[y];
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        v[row[i].x] += alpha * row[i].v;
+    }
+}
+
+
+void SparseMatrix::clearRow(uint y)
+{
+    nvDebugCheck( y < height() );
+    // Drops every stored coefficient of row y, so the whole row reads as zero afterwards.
+    m_array[y].clear();
+}
+
+// Multiplies every stored coefficient of row y by f.
+void SparseMatrix::scaleRow(uint y, float f)
+{
+    nvDebugCheck( y < height() );
+    Array<Coefficient> & row = m_array[y];
+    for (uint i = 0, count = row.count(); i < count; i++)
+    {
+        row[i].v *= f;
+    }
+}
+
+// Scales row y to unit Euclidean length. A row whose squared length is zero is left unchanged.
+void SparseMatrix::normalizeRow(uint y)
+{
+    nvDebugCheck( y < height() );
+
+    float norm = 0.0f;
+    const uint count = m_array[y].count();
+    for (uint i = 0; i < count; i++)
+    {
+        float f = m_array[y][i].v;
+        norm += f * f;
+    }
+    // Guard: 1/sqrt(0) is infinite and would turn explicitly stored zeros into NaN.
+    if (norm > 0.0f) scaleRow(y, 1.0f / sqrtf(norm));
+}
+
+
+// Sets the stored coefficient of column x to 0 in every row (the entries stay in the rows).
+void SparseMatrix::clearColumn(uint x)
+{
+    nvDebugCheck(x < width());
+
+    for (uint y = 0, rowCount = height(); y < rowCount; y++)
+    {
+        Array<Coefficient> & row = m_array[y];
+        for (uint e = 0, count = row.count(); e < count; e++)
+        {
+            if (row[e].x != x) continue;
+            // Only the first match of the row is changed, as in a row with unique columns.
+            row[e].v = 0.0f;
+            break;
+        }
+    }
+}
+
+void SparseMatrix::scaleColumn(uint x, float f)
+{
+    nvDebugCheck(x < width());
+    // Multiply the first entry of column x in every row by f.
+    const uint rowCount = height();
+    for (uint r = 0; r != rowCount; ++r)
+    {
+        Array<Coefficient> & row = m_array[r];
+        const uint entryCount = row.count();
+        for (uint k = 0; k != entryCount; ++k)
+        {
+            if (row[k].x != x) continue;
+            row[k].v *= f;
+            break;
+        }
+    }
+}
+
+const Array<SparseMatrix::Coefficient> & SparseMatrix::getRow(uint y) const
+{
+    return m_array[y]; // Read-only reference to row y's stored coefficients; no range check is made here.
+}
+
+
+// Returns true when every stored off-diagonal entry matches its mirror entry.
+bool SparseMatrix::isSymmetric() const
+{
+    for (uint y = 0; y < height(); y++)
+    {
+        const uint count = m_array[y].count();
+        for (uint e = 0; e < count; e++)
+        {
+            const uint x = m_array[y][e].x;
+            // x != y (not x > y): a lower entry may lack a stored upper mirror.
+            if (x != y) {
+                float v = m_array[y][e].v;
+                if (!equal(getCoefficient(y, x), v)) {  // @@ epsilon
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+
+// y = M * x (M not transposed); forwards to the overload taking a Transpose flag.
+void nv::mult(const SparseMatrix & M, const FullVector & x, FullVector & y)
+{
+    mult(NoTransposed, M, x, y);
+}
+
+void nv::mult(Transpose TM, const SparseMatrix & M, const FullVector & x, FullVector & y)
+{
+    // Computes y = M * x, or y = M^T * x when TM == Transposed.
+    const uint cols = M.width();
+    const uint rows = M.height();
+
+    if (TM != Transposed)
+    {
+        nvDebugCheck( cols == x.dimension() );
+        nvDebugCheck( rows == y.dimension() );
+
+        // Each output element is one row of M dotted with x.
+        for (uint r = 0; r != rows; ++r)
+        {
+            y[r] = M.dotRow(r, x);
+        }
+        return;
+    }
+
+    nvDebugCheck( rows == x.dimension() );
+    nvDebugCheck( cols == y.dimension() );
+    // Transposed: start from zero and accumulate x[r] * (row r) into y.
+    y.fill(0.0f);
+    for (uint r = 0; r != rows; ++r)
+    {
+        M.madRow(r, x[r], y);
+    }
+}
+
+// y = alpha*A*x + beta*y (A not transposed); forwards to the overload taking a Transpose flag.
+void nv::sgemv(float alpha, const SparseMatrix & A, const FullVector & x, float beta, FullVector & y)
+{
+    sgemv(alpha, NoTransposed, A, x, beta, y);
+}
+
+// y = alpha*op(A)*x + beta*y, with op(A) = A^T when TA == Transposed.
+void nv::sgemv(float alpha, Transpose TA, const SparseMatrix & A, const FullVector & x, float beta, FullVector & y)
+{
+    const uint w = A.width();
+    const uint h = A.height();
+    if (TA == Transposed)
+    {
+        nvDebugCheck( h == x.dimension() );
+        nvDebugCheck( w == y.dimension() );
+        y *= beta; // Scale y by beta before accumulating alpha * A^T * x into it.
+        for (uint i = 0; i < h; i++)
+        {
+            A.madRow(i, alpha * x[i], y); // y += alpha * x[i] * (row i of A)
+        }
+    }
+    else
+    {
+        nvDebugCheck( w == x.dimension() );
+        nvDebugCheck( h == y.dimension() );
+
+        for (uint i = 0; i < h; i++)
+        {
+            y[i] = alpha * A.dotRow(i, x) + beta * y[i];
+        }
+    }
+}
+
+
+// Dot product of row y of A with column x of B.
+static float dotRowColumn(int y, const SparseMatrix & A, int x, const SparseMatrix & B)
+{
+    // Walk only A's stored entries; an entry in column k pairs with B at (column x, row k).
+    const Array<SparseMatrix::Coefficient> & entries = A.getRow(y);
+    const uint entryCount = entries.count();
+
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint k = 0; k != entryCount; ++k)
+    {
+        const SparseMatrix::Coefficient & entry = entries[k];
+        accumulator.add(entry.v * B.getCoefficient(x, entry.x));
+    }
+    return accumulator.sum();
+#else
+    float total = 0.0f;
+    for (uint k = 0; k != entryCount; ++k)
+    {
+        const SparseMatrix::Coefficient & entry = entries[k];
+        total += entry.v * B.getCoefficient(x, entry.x);
+    }
+    return total;
+#endif
+}
+
+// Dot product of row y of A with row x of B.
+static float dotRowRow(int y, const SparseMatrix & A, int x, const SparseMatrix & B)
+{
+    // Walk only A's stored entries; each pairs with B at (same column, row x).
+    const Array<SparseMatrix::Coefficient> & entries = A.getRow(y);
+    const uint entryCount = entries.count();
+
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint k = 0; k != entryCount; ++k)
+    {
+        const SparseMatrix::Coefficient & entry = entries[k];
+        accumulator.add(entry.v * B.getCoefficient(entry.x, x));
+    }
+    return accumulator.sum();
+#else
+    float total = 0.0f;
+    for (uint k = 0; k != entryCount; ++k)
+    {
+        const SparseMatrix::Coefficient & entry = entries[k];
+        total += entry.v * B.getCoefficient(entry.x, x);
+    }
+    return total;
+#endif
+}
+
+// Dot product of column y of A with column x of B.
+static float dotColumnColumn(int y, const SparseMatrix & A, int x, const SparseMatrix & B)
+{
+    nvDebugCheck(A.height() == B.height());
+
+    // Both columns are walked row by row over A's height.
+    const uint rowCount = A.height();
+#if USE_KAHAN_SUM
+    KahanSum accumulator;
+    for (uint r = 0; r != rowCount; ++r)
+    {
+        accumulator.add(A.getCoefficient(y, r) * B.getCoefficient(x, r));
+    }
+    return accumulator.sum();
+#else
+    float total = 0.0f;
+    for (uint r = 0; r != rowCount; ++r)
+    {
+        total += A.getCoefficient(y, r) * B.getCoefficient(x, r);
+    }
+    return total;
+#endif
+}
+
+
+void nv::transpose(const SparseMatrix & A, SparseMatrix & B)
+{
+    // Writes the transpose of A into B after emptying B's rows.
+    nvDebugCheck(A.width() == B.height());
+    nvDebugCheck(B.width() == A.height());
+
+    const uint srcCols = A.width();
+    const uint srcRows = A.height();
+
+    // B has one row per column of A; empty them all first.
+    for (uint r = 0; r != srcCols; ++r)
+    {
+        B.clearRow(r);
+    }
+
+    for (uint r = 0; r != srcRows; ++r)
+    {
+        const Array<SparseMatrix::Coefficient> & entries = A.getRow(r);
+        for (uint k = 0, n = entries.count(); k != n; ++k)
+        {
+            nvDebugCheck(entries[k].x < srcCols);
+            // A(row r, column c) becomes B(row c, column r).
+            B.setCoefficient(r, entries[k].x, entries[k].v);
+        }
+    }
+}
+
+// C = A * B (neither operand transposed); forwards to the overload taking Transpose flags.
+void nv::mult(const SparseMatrix & A, const SparseMatrix & B, SparseMatrix & C)
+{
+    mult(NoTransposed, A, NoTransposed, B, C);
+}
+
+void nv::mult(Transpose TA, const SparseMatrix & A, Transpose TB, const SparseMatrix & B, SparseMatrix & C)
+{
+    sgemm(1.0f, TA, A, TB, B, 0.0f, C); // Plain product expressed as sgemm with alpha = 1 and beta = 0.
+}
+
+// C = alpha*A*B + beta*C (neither operand transposed); forwards to the overload taking Transpose flags.
+void nv::sgemm(float alpha, const SparseMatrix & A, const SparseMatrix & B, float beta, SparseMatrix & C)
+{
+    sgemm(alpha, NoTransposed, A, NoTransposed, B, beta, C);
+}
+
+// C = alpha*op(A)*op(B) + beta*C, where op() transposes its operand when the flag is Transposed.
+void nv::sgemm(float alpha, Transpose TA, const SparseMatrix & A, Transpose TB, const SparseMatrix & B, float beta, SparseMatrix & C)
+{
+    const uint w = C.width();
+    const uint h = C.height();
+    uint aw = (TA == NoTransposed) ? A.width() : A.height();
+    uint ah = (TA == NoTransposed) ? A.height() : A.width();
+    uint bw = (TB == NoTransposed) ? B.width() : B.height();
+    uint bh = (TB == NoTransposed) ? B.height() : B.width();
+
+    // op(A) is ah x aw and op(B) is bh x bw: the inner dimensions must match
+    // and C must be ah x bw. The result is not required to be square.
+    nvDebugCheck(aw == bh);
+    nvDebugCheck(w == bw);
+    nvDebugCheck(h == ah);
+
+    for (uint y = 0; y < h; y++)
+    {
+        for (uint x = 0; x < w; x++)
+        {
+            float c = beta * C.getCoefficient(x, y);
+
+            if (TA == NoTransposed && TB == NoTransposed)
+            {
+                // dot y-row of A by x-column of B.
+                c += alpha * dotRowColumn(y, A, x, B);
+            }
+            else if (TA == Transposed && TB == Transposed)
+            {
+                // dot y-column of A by x-row of B.
+                c += alpha * dotRowColumn(x, B, y, A);
+            }
+            else if (TA == Transposed && TB == NoTransposed)
+            {
+                // dot y-column of A by x-column of B.
+                c += alpha * dotColumnColumn(y, A, x, B);
+            }
+            else if (TA == NoTransposed && TB == Transposed)
+            {
+                // dot y-row of A by x-row of B.
+                c += alpha * dotRowRow(y, A, x, B);
+            }
+
+            C.setCoefficient(x, y, c);
+        }
+    }
+}
+
+// C = At * A (product of A's transpose with A).
+void nv::sqm(const SparseMatrix & A, SparseMatrix & C)
+{
+    // This is quite expensive: sgemm evaluates a column-by-column dot product for every element of C.
+    mult(Transposed, A, NoTransposed, A, C);
+}
diff --git a/thirdparty/thekla_atlas/nvmath/Sparse.h b/thirdparty/thekla_atlas/nvmath/Sparse.h
new file mode 100644
index 0000000000..6b03ed51f3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Sparse.h
@@ -0,0 +1,204 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_SPARSE_H
+#define NV_MATH_SPARSE_H
+
+#include "nvmath.h"
+#include "nvcore/Array.h"
+
+
+// Full and sparse vector and matrix classes. BLAS subset.
+
+namespace nv
+{
+    class FullVector;
+    class FullMatrix;
+    class SparseMatrix;
+
+
+    /// Fixed size vector class.
+    class FullVector
+    {
+    public:
+        // NOTE(review): the single-argument constructor is not explicit, so a uint converts implicitly.
+        FullVector(uint dim);
+        FullVector(const FullVector & v);
+
+        const FullVector & operator=(const FullVector & v);
+        // Number of elements, as reported by the backing array.
+        uint dimension() const { return m_array.count(); }
+        // Element access; forwards straight to the backing array.
+        const float & operator[]( uint index ) const { return m_array[index]; }
+        float & operator[] ( uint index ) { return m_array[index]; }
+
+        void fill(float f);
+        // Vector-vector compound operators (defined out of line).
+        void operator+= (const FullVector & v);
+        void operator-= (const FullVector & v);
+        void operator*= (const FullVector & v);
+        // Vector-scalar compound operators (defined out of line).
+        void operator+= (float f);
+        void operator-= (float f);
+        void operator*= (float f);
+
+
+    private:
+        // Element storage.
+        Array<float> m_array;
+
+    };
+
+    // Pseudo-BLAS interface.
+    NVMATH_API void saxpy(float a, const FullVector & x, FullVector & y); // y = a * x + y
+    NVMATH_API void copy(const FullVector & x, FullVector & y);
+    NVMATH_API void scal(float a, FullVector & x);
+    NVMATH_API float dot(const FullVector & x, const FullVector & y);
+
+
+    enum Transpose
+    {
+        NoTransposed = 0,
+        Transposed = 1
+    };
+
+    /// Full matrix class.
+    class FullMatrix
+    {
+    public:
+        // NOTE(review): FullMatrix(uint) is not explicit, so a uint converts implicitly.
+        FullMatrix(uint d);
+        FullMatrix(uint w, uint h);
+        FullMatrix(const FullMatrix & m);
+        // m_width and m_height are const, so assignment cannot change the dimensions.
+        const FullMatrix & operator=(const FullMatrix & m);
+
+        uint width() const { return m_width; }
+        uint height() const { return m_height; }
+        bool isSquare() const { return m_width == m_height; }
+        // Presumably x is the column and y the row, as on SparseMatrix::getCoefficient -- confirm.
+        float getCoefficient(uint x, uint y) const;
+
+        void setCoefficient(uint x, uint y, float f);
+        void addCoefficient(uint x, uint y, float f);
+        void mulCoefficient(uint x, uint y, float f);
+
+        float dotRow(uint y, const FullVector & v) const;
+        void madRow(uint y, float alpha, FullVector & v) const;
+
+    protected:
+        // True when the storage holds exactly width * height elements.
+        bool isValid() const {
+            return m_array.size() == (m_width * m_height);
+        }
+
+    private:
+
+        const uint m_width;
+        const uint m_height;
+        Array<float> m_array;
+
+    };
+
+    NVMATH_API void mult(const FullMatrix & M, const FullVector & x, FullVector & y);
+    NVMATH_API void mult(Transpose TM, const FullMatrix & M, const FullVector & x, FullVector & y);
+
+    // y = alpha*A*x + beta*y
+    NVMATH_API void sgemv(float alpha, const FullMatrix & A, const FullVector & x, float beta, FullVector & y);
+    NVMATH_API void sgemv(float alpha, Transpose TA, const FullMatrix & A, const FullVector & x, float beta, FullVector & y);
+
+    NVMATH_API void mult(const FullMatrix & A, const FullMatrix & B, FullMatrix & C);
+    NVMATH_API void mult(Transpose TA, const FullMatrix & A, Transpose TB, const FullMatrix & B, FullMatrix & C);
+
+    // C = alpha*A*B + beta*C
+    NVMATH_API void sgemm(float alpha, const FullMatrix & A, const FullMatrix & B, float beta, FullMatrix & C);
+    NVMATH_API void sgemm(float alpha, Transpose TA, const FullMatrix & A, Transpose TB, const FullMatrix & B, float beta, FullMatrix & C);
+
+
+    /**
+    * Sparse matrix class. The matrix is assumed to have very few non-zero
+    * elements, so it is stored in indexed format: each row keeps a list of
+    * (column, value) coefficients. This row-wise layout makes multiplying
+    * the matrix by a column vector efficient. The number of columns is
+    * fixed at construction (m_width is const); the number of rows is the
+    * length of the row array.
+    **/
+    class SparseMatrix
+    {
+        friend class FullMatrix;
+    public:
+
+        // An element of the sparse array.
+        struct Coefficient {
+            uint x;  // column
+            float v; // value
+        };
+
+
+    public:
+        // NOTE(review): SparseMatrix(uint) is not explicit, so a uint converts implicitly.
+        SparseMatrix(uint d);
+        SparseMatrix(uint w, uint h);
+        SparseMatrix(const SparseMatrix & m);
+
+        const SparseMatrix & operator=(const SparseMatrix & m);
+
+        // Height is the number of row arrays; width is stored separately.
+        uint width() const { return m_width; }
+        uint height() const { return m_array.count(); }
+        bool isSquare() const { return width() == height(); }
+
+        float getCoefficient(uint x, uint y) const; // x is column, y is row
+
+        void setCoefficient(uint x, uint y, float f);
+        void addCoefficient(uint x, uint y, float f);
+        void mulCoefficient(uint x, uint y, float f);
+        // Row operations: act on the stored coefficients of row y.
+        float sumRow(uint y) const;
+        float dotRow(uint y, const FullVector & v) const;
+        void madRow(uint y, float alpha, FullVector & v) const;
+
+        void clearRow(uint y);
+        void scaleRow(uint y, float f);
+        void normalizeRow(uint y);
+        // Column operations: scan every row for an entry in column x.
+        void clearColumn(uint x);
+        void scaleColumn(uint x, float f);
+        // Read-only access to the stored coefficients of row y.
+        const Array<Coefficient> & getRow(uint y) const;
+
+        bool isSymmetric() const;
+
+    private:
+
+        /// Number of columns.
+        const uint m_width;
+
+        /// One array of stored coefficients per row.
+        Array< Array<Coefficient> > m_array;
+
+    };
+
+    NVMATH_API void transpose(const SparseMatrix & A, SparseMatrix & B);
+
+    NVMATH_API void mult(const SparseMatrix & M, const FullVector & x, FullVector & y);
+    NVMATH_API void mult(Transpose TM, const SparseMatrix & M, const FullVector & x, FullVector & y);
+
+    // y = alpha*A*x + beta*y
+    NVMATH_API void sgemv(float alpha, const SparseMatrix & A, const FullVector & x, float beta, FullVector & y);
+    NVMATH_API void sgemv(float alpha, Transpose TA, const SparseMatrix & A, const FullVector & x, float beta, FullVector & y);
+
+    NVMATH_API void mult(const SparseMatrix & A, const SparseMatrix & B, SparseMatrix & C);
+    NVMATH_API void mult(Transpose TA, const SparseMatrix & A, Transpose TB, const SparseMatrix & B, SparseMatrix & C);
+
+    // C = alpha*A*B + beta*C
+    NVMATH_API void sgemm(float alpha, const SparseMatrix & A, const SparseMatrix & B, float beta, SparseMatrix & C);
+    NVMATH_API void sgemm(float alpha, Transpose TA, const SparseMatrix & A, Transpose TB, const SparseMatrix & B, float beta, SparseMatrix & C);
+
+    // C = At * A
+    NVMATH_API void sqm(const SparseMatrix & A, SparseMatrix & C);
+
+} // nv namespace
+
+
+#endif // NV_MATH_SPARSE_H
diff --git a/thirdparty/thekla_atlas/nvmath/Sphere.cpp b/thirdparty/thekla_atlas/nvmath/Sphere.cpp
new file mode 100644
index 0000000000..e0c1ad652c
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Sphere.cpp
@@ -0,0 +1,431 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "Sphere.h"
+#include "Vector.inl"
+#include "Box.inl"
+
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+
+const float radiusEpsilon = 1e-4f;
+
+Sphere::Sphere(Vector3::Arg p0, Vector3::Arg p1)
+{
+    // Coincident points degenerate to a zero-radius sphere at that point.
+    if (p0 == p1) {
+        *this = Sphere(p0);
+        return;
+    }
+    // Otherwise the segment p0-p1 is a diameter (radius padded by radiusEpsilon).
+    center = (p0 + p1) * 0.5f;
+    radius = length(p0 - center) + radiusEpsilon;
+    nvDebugCheck(equal(length(p0 - center), radius - radiusEpsilon));
+    nvDebugCheck(equal(length(p1 - center), radius - radiusEpsilon));
+}
+
+Sphere::Sphere(Vector3::Arg p0, Vector3::Arg p1, Vector3::Arg p2)
+{
+    if (p0 == p1 || p0 == p2) *this = Sphere(p1, p2);
+    else if (p1 == p2) *this = Sphere(p0, p2);
+    else {
+        Vector3 a = p1 - p0;
+        Vector3 b = p2 - p0;
+        Vector3 c = cross(a, b);
+
+        float denominator = 2.0f * lengthSquared(c);
+        // A zero cross product means the three points are collinear.
+        if (!isZero(denominator)) {
+            Vector3 d = (lengthSquared(b) * cross(c, a) + lengthSquared(a) * cross(b, c)) / denominator;
+            // d is the offset from p0 to the point equidistant from all three inputs.
+            center = p0 + d;
+            radius = length(d) + radiusEpsilon;
+
+            float d0 = length(p0 - center);
+            float d1 = length(p1 - center);
+            float d2 = length(p2 - center);
+            nvDebugCheck(equal(d0, radius - radiusEpsilon));
+            nvDebugCheck(equal(d1, radius - radiusEpsilon));
+            nvDebugCheck(equal(d2, radius - radiusEpsilon));
+        }
+        else {
+            // @@ This is a specialization of the code below, but really, the only thing we need to do here is to find the two most distant points.
+            // Compute all possible spheres, invalidate those that do not contain the three points, keep the smallest.
+            Sphere s0(p1, p2);
+            float d0 = distanceSquared(s0, p0);
+            if (d0 > 0) s0.radius = NV_FLOAT_MAX;
+
+            Sphere s1(p0, p2);
+            float d1 = distanceSquared(s1, p1);
+            if (d1 > 0) s1.radius = NV_FLOAT_MAX;
+
+            Sphere s2(p0, p1);
+            float d2 = distanceSquared(s2, p2);
+            if (d2 > 0) s2.radius = NV_FLOAT_MAX; // Invalidate s2 itself when it misses p2.
+
+            if (s0.radius < s1.radius && s0.radius < s2.radius) {
+                center = s0.center;
+                radius = s0.radius;
+            }
+            else if (s1.radius < s2.radius) {
+                center = s1.center;
+                radius = s1.radius;
+            }
+            else {
+                center = s2.center;
+                radius = s2.radius;
+            }
+        }
+    }
+}
+
+Sphere::Sphere(Vector3::Arg p0, Vector3::Arg p1, Vector3::Arg p2, Vector3::Arg p3)
+{
+    if (p0 == p1 || p0 == p2 || p0 == p3) *this = Sphere(p1, p2, p3);
+    else if (p1 == p2 || p1 == p3) *this = Sphere(p0, p2, p3);
+    else if (p2 == p3) *this = Sphere(p0, p1, p2);
+    else {
+        // @@ This only works if the points are not coplanar!
+        Vector3 a = p1 - p0;
+        Vector3 b = p2 - p0;
+        Vector3 c = p3 - p0;
+
+        float denominator = 2.0f * dot(c, cross(a, b)); // triple product.
+        // A zero triple product means the four points are coplanar.
+        if (!isZero(denominator)) {
+            Vector3 d = (lengthSquared(c) * cross(a, b) + lengthSquared(b) * cross(c, a) + lengthSquared(a) * cross(b, c)) / denominator;
+            // d is the offset from p0 to the point equidistant from all four inputs.
+            center = p0 + d;
+            radius = length(d) + radiusEpsilon;
+
+            float d0 = length(p0 - center);
+            float d1 = length(p1 - center);
+            float d2 = length(p2 - center);
+            float d3 = length(p3 - center);
+            nvDebugCheck(equal(d0, radius - radiusEpsilon));
+            nvDebugCheck(equal(d1, radius - radiusEpsilon));
+            nvDebugCheck(equal(d2, radius - radiusEpsilon));
+            nvDebugCheck(equal(d3, radius - radiusEpsilon));
+        }
+        else {
+            // Compute all possible spheres, invalidate those that do not contain the four points, keep the smallest.
+            Sphere s0(p1, p2, p3);
+            float d0 = distanceSquared(s0, p0);
+            if (d0 > 0) s0.radius = NV_FLOAT_MAX;
+
+            Sphere s1(p0, p2, p3);
+            float d1 = distanceSquared(s1, p1);
+            if (d1 > 0) s1.radius = NV_FLOAT_MAX;
+
+            Sphere s2(p0, p1, p3);
+            float d2 = distanceSquared(s2, p2);
+            if (d2 > 0) s2.radius = NV_FLOAT_MAX;
+
+            Sphere s3(p0, p1, p2);
+            float d3 = distanceSquared(s3, p3);
+            if (d3 > 0) s3.radius = NV_FLOAT_MAX; // Invalidate s3 itself when it misses p3.
+
+            if (s0.radius < s1.radius && s0.radius < s2.radius && s0.radius < s3.radius) {
+                center = s0.center;
+                radius = s0.radius;
+            }
+            else if (s1.radius < s2.radius && s1.radius < s3.radius) {
+                center = s1.center;
+                radius = s1.radius;
+            }
+            else if (s2.radius < s3.radius) { // s2 is chosen here, so s2 is the one compared with s3.
+                center = s2.center;
+                radius = s2.radius;
+            }
+            else {
+                center = s3.center;
+                radius = s3.radius;
+            }
+        }
+    }
+}
+
+
+float nv::distanceSquared(const Sphere & sphere, const Vector3 & point)
+{
+    return lengthSquared(sphere.center - point) - square(sphere.radius); // |center - point|^2 - radius^2: negative inside, positive outside.
+}
+
+
+
+// Implementation of "MiniBall" based on:
+// http://www.flipcode.com/archives/Smallest_Enclosing_Spheres.shtml
+// P[0..p) are the points still to enclose; P[-1]..P[-b] are the b boundary points fixed so far.
+static Sphere recurseMini(const Vector3 *P[], uint p, uint b = 0)
+{
+	Sphere MB;
+	// Start from the sphere defined by the b boundary points alone (b == 0 uses P[0] with zero radius).
+	switch(b)
+	{
+	case 0:
+		MB = Sphere(*P[0]);
+		break;
+	case 1:
+		MB = Sphere(*P[-1]);
+		break;
+	case 2:
+		MB = Sphere(*P[-1], *P[-2]);
+		break;
+	case 3:
+		MB = Sphere(*P[-1], *P[-2], *P[-3]);
+		break;
+	case 4:
+		MB = Sphere(*P[-1], *P[-2], *P[-3], *P[-4]);
+		return MB;
+	}
+	// A point outside MB is rotated to P[0]; the recursion then treats it as a new boundary point (its P[-1]).
+	for (uint i = 0; i < p; i++)
+    {
+        if (distanceSquared(MB, *P[i]) > 0)   // Signed square distance to sphere
+		{
+			for (uint j = i; j > 0; j--)
+			{
+                swap(P[j], P[j-1]);
+			}
+
+			MB = recurseMini(P + 1, i, b + 1);
+		}
+    }
+
+	return MB;
+}
+
+static bool allInside(const Sphere & sphere, const Vector3 * pointArray, const uint pointCount) {
+    // A point counts as outside once its signed squared distance reaches NV_EPSILON.
+    const Vector3 * const end = pointArray + pointCount;
+    for (const Vector3 * p = pointArray; p != end; ++p) {
+        if (distanceSquared(sphere, *p) >= NV_EPSILON) return false;
+    }
+    return true;
+}
+
+
+Sphere nv::miniBall(const Vector3 * pointArray, const uint pointCount)
+{
+    nvDebugCheck(pointArray != NULL);
+    nvDebugCheck(pointCount > 0);
+
+    // recurseMini reorders its input, so hand it a scratch array of pointers
+    // and leave the caller's points untouched.
+    const Vector3 ** scratch = new const Vector3*[pointCount];
+    const Vector3 * cursor = pointArray;
+    for (uint k = 0; k != pointCount; ++k, ++cursor) {
+        scratch[k] = cursor;
+    }
+
+    Sphere result = recurseMini(scratch, pointCount);
+    delete [] scratch;
+
+    nvDebugCheck(allInside(result, pointArray, pointCount));
+    return result;
+}
+
+
+// Approximate bounding sphere, based on "An Efficient Bounding Sphere" by Jack Ritter, from "Graphics Gems"
+Sphere nv::approximateSphere_Ritter(const Vector3 * pointArray, const uint pointCount)
+{
+    nvDebugCheck(pointArray != NULL);
+    nvDebugCheck(pointCount > 0);
+
+    Vector3 xmin, xmax, ymin, ymax, zmin, zmax;
+    // Seed all six extreme points with the first input point.
+    xmin = xmax = ymin = ymax = zmin = zmax = pointArray[0];
+
+    // FIRST PASS: find 6 minima/maxima points
+    xmin.x = ymin.y = zmin.z = FLT_MAX;
+    xmax.x = ymax.y = zmax.z = -FLT_MAX;
+
+    for (uint i = 0; i < pointCount; i++)
+	{
+        const Vector3 & p = pointArray[i];
+        if (p.x < xmin.x) xmin = p;
+	    if (p.x > xmax.x) xmax = p;
+	    if (p.y < ymin.y) ymin = p;
+	    if (p.y > ymax.y) ymax = p;
+	    if (p.z < zmin.z) zmin = p;
+	    if (p.z > zmax.z) zmax = p;
+	}
+    // Squared distance between the two extreme points found along each axis.
+    float xspan = lengthSquared(xmax - xmin);
+    float yspan = lengthSquared(ymax - ymin);
+    float zspan = lengthSquared(zmax - zmin);
+
+    // Set points dia1 & dia2 to the maximally separated pair.
+    Vector3 dia1 = xmin;
+    Vector3 dia2 = xmax;
+    float maxspan = xspan;
+    if (yspan > maxspan) {
+	    maxspan = yspan;
+	    dia1 = ymin;
+        dia2 = ymax;
+	}
+    if (zspan > maxspan) {
+	    dia1 = zmin;
+        dia2 = zmax;
+	}
+
+    // |dia1-dia2| is a diameter of initial sphere
+
+    // calc initial center
+    Sphere sphere;
+    sphere.center = (dia1 + dia2) / 2.0f;
+
+    // calculate initial radius**2 and radius
+    float rad_sq = lengthSquared(dia2 - sphere.center);
+    sphere.radius = sqrtf(rad_sq);
+
+    // Each point found outside enlarges the sphere and shifts its center toward that point.
+    // SECOND PASS: increment current sphere
+    for (uint i = 0; i < pointCount; i++)
+	{
+        const Vector3 & p = pointArray[i];
+
+        float old_to_p_sq = lengthSquared(p - sphere.center);
+
+	    if (old_to_p_sq > rad_sq) 	// do r**2 test first
+		{
+            // this point is outside of current sphere
+		    float old_to_p = sqrtf(old_to_p_sq);
+
+		    // calc radius of new sphere
+            sphere.radius = (sphere.radius + old_to_p) / 2.0f;
+		    rad_sq = sphere.radius * sphere.radius; 	// for next r**2 compare
+
+            float old_to_new = old_to_p - sphere.radius;
+
+		    // calc center of new sphere
+            sphere.center = (sphere.radius * sphere.center + old_to_new * p) / old_to_p;
+		}
+	}
+
+    nvDebugCheck(allInside(sphere, pointArray, pointCount));
+
+    return sphere;
+}
+
+
+static float computeSphereRadius(const Vector3 & center, const Vector3 * pointArray, const uint pointCount) {
+    // Distance from `center` to the farthest point, padded by radiusEpsilon.
+    // Squared distances are compared so only one square root is needed.
+    float farthestSq = 0;
+
+    const Vector3 * const end = pointArray + pointCount;
+    for (const Vector3 * p = pointArray; p != end; ++p)
+    {
+        const float distSq = lengthSquared(center - *p);
+        // Keep the largest squared distance seen so far.
+        if (distSq > farthestSq) {
+            farthestSq = distSq;
+        }
+    }
+
+    return sqrtf(farthestSq) + radiusEpsilon;
+}
+
+
+Sphere nv::approximateSphere_AABB(const Vector3 * pointArray, const uint pointCount)
+{
+    nvDebugCheck(pointArray != NULL);
+    nvDebugCheck(pointCount > 0);
+
+    // Grow a bounding box over all the points.
+    Box bounds;
+    bounds.clearBounds();
+    for (uint k = 0; k != pointCount; ++k) {
+        bounds.addPointToBounds(pointArray[k]);
+    }
+
+    // Center the sphere on the box, then size it to reach the farthest point.
+    Sphere result;
+    result.center = bounds.center();
+    result.radius = computeSphereRadius(result.center, pointArray, pointCount);
+    nvDebugCheck(allInside(result, pointArray, pointCount));
+
+    return result;
+}
+
+
+static void computeExtremalPoints(const Vector3 & dir, const Vector3 * pointArray, uint pointCount, Vector3 * minPoint, Vector3 * maxPoint) {
+    nvDebugCheck(pointCount > 0);
+
+    // Find the points with the smallest and largest projection onto dir.
+    // Both indices start at 0, so the first point is reported if no comparison succeeds.
+    float lowest = FLT_MAX;
+    float highest = -FLT_MAX;
+    uint lowestIndex = 0;
+    uint highestIndex = 0;
+
+    for (uint k = 0; k != pointCount; ++k) {
+        const float projection = dot(dir, pointArray[k]);
+        if (projection < lowest) {
+            lowest = projection;
+            lowestIndex = k;
+        }
+        if (projection > highest) {
+            highest = projection;
+            highestIndex = k;
+        }
+    }
+    nvDebugCheck(lowest != FLT_MAX);
+    nvDebugCheck(highest != -FLT_MAX);
+    *minPoint = pointArray[lowestIndex];
+    *maxPoint = pointArray[highestIndex];
+}
+
+// EPOS algorithm based on:
+// http://www.ep.liu.se/ecp/034/009/ecp083409.pdf
+Sphere nv::approximateSphere_EPOS6(const Vector3 * pointArray, uint pointCount)
+{
+    nvDebugCheck(pointArray != NULL);
+    nvDebugCheck(pointCount > 0);
+
+    Vector3 extremalPoints[6];
+
+    // Compute 6 extremal points: the min and max point along each coordinate axis.
+    computeExtremalPoints(Vector3(1, 0, 0), pointArray, pointCount, extremalPoints+0, extremalPoints+1);
+    computeExtremalPoints(Vector3(0, 1, 0), pointArray, pointCount, extremalPoints+2, extremalPoints+3);
+    computeExtremalPoints(Vector3(0, 0, 1), pointArray, pointCount, extremalPoints+4, extremalPoints+5);
+    // miniBall of the extremal points gives the center; the radius is then recomputed over every input point.
+    Sphere sphere = miniBall(extremalPoints, 6);
+    sphere.radius = computeSphereRadius(sphere.center, pointArray, pointCount);
+
+    nvDebugCheck(allInside(sphere, pointArray, pointCount));
+
+    return sphere;
+}
+
+Sphere nv::approximateSphere_EPOS14(const Vector3 * pointArray, uint pointCount)
+{
+    nvDebugCheck(pointArray != NULL);
+    nvDebugCheck(pointCount > 0);
+
+    Vector3 extremalPoints[14];
+
+    // Compute 14 extremal points: min/max along the 3 coordinate axes and 4 diagonal directions.
+    computeExtremalPoints(Vector3(1, 0, 0), pointArray, pointCount, extremalPoints+0, extremalPoints+1);
+    computeExtremalPoints(Vector3(0, 1, 0), pointArray, pointCount, extremalPoints+2, extremalPoints+3);
+    computeExtremalPoints(Vector3(0, 0, 1), pointArray, pointCount, extremalPoints+4, extremalPoints+5);
+    // d = sqrt(1/3), so each (+-d, +-d, d) direction below has unit length.
+    float d = sqrtf(1.0f/3.0f);
+
+    computeExtremalPoints(Vector3(d, d, d), pointArray, pointCount, extremalPoints+6, extremalPoints+7);
+    computeExtremalPoints(Vector3(-d, d, d), pointArray, pointCount, extremalPoints+8, extremalPoints+9);
+    computeExtremalPoints(Vector3(-d, -d, d), pointArray, pointCount, extremalPoints+10, extremalPoints+11);
+    computeExtremalPoints(Vector3(d, -d, d), pointArray, pointCount, extremalPoints+12, extremalPoints+13);
+
+    // miniBall of the extremal points gives the center; the radius is then recomputed over every input point.
+    Sphere sphere = miniBall(extremalPoints, 14);
+    sphere.radius = computeSphereRadius(sphere.center, pointArray, pointCount);
+
+    nvDebugCheck(allInside(sphere, pointArray, pointCount));
+
+    return sphere;
+}
+
+
+
diff --git a/thirdparty/thekla_atlas/nvmath/Sphere.h b/thirdparty/thekla_atlas/nvmath/Sphere.h
new file mode 100644
index 0000000000..300731af44
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Sphere.h
@@ -0,0 +1,43 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_SPHERE_H
+#define NV_MATH_SPHERE_H
+
+#include "Vector.h"
+
+namespace nv
+{
+    
+    class Sphere
+    {
+    public:
+        Sphere() {} // NOTE(review): radius is not initialized by this constructor.
+        Sphere(Vector3::Arg center, float radius) : center(center), radius(radius) {}
+        // Spheres built from 1 to 4 points; the 1-point form has zero radius, the rest are defined in Sphere.cpp.
+        Sphere(Vector3::Arg center) : center(center), radius(0.0f) {}
+        Sphere(Vector3::Arg p0, Vector3::Arg p1);
+        Sphere(Vector3::Arg p0, Vector3::Arg p1, Vector3::Arg p2);
+        Sphere(Vector3::Arg p0, Vector3::Arg p1, Vector3::Arg p2, Vector3::Arg p3);
+
+        Vector3 center;
+        float radius;
+    };
+
+    // Returns negative values if point is inside.
+    float distanceSquared(const Sphere & sphere, const Vector3 &point);
+
+
+    // Welzl's algorithm. Fairly slow, recursive implementation uses large stack.
+    Sphere miniBall(const Vector3 * pointArray, uint pointCount);
+
+    Sphere approximateSphere_Ritter(const Vector3 * pointArray, uint pointCount);
+    Sphere approximateSphere_AABB(const Vector3 * pointArray, uint pointCount);
+    Sphere approximateSphere_EPOS6(const Vector3 * pointArray, uint pointCount);
+    Sphere approximateSphere_EPOS14(const Vector3 * pointArray, uint pointCount);
+
+
+} // nv namespace
+
+
+#endif // NV_MATH_SPHERE_H
diff --git a/thirdparty/thekla_atlas/nvmath/TypeSerialization.cpp b/thirdparty/thekla_atlas/nvmath/TypeSerialization.cpp
new file mode 100644
index 0000000000..72fa678f47
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/TypeSerialization.cpp
@@ -0,0 +1,54 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "TypeSerialization.h"
+
+#include "nvcore/Stream.h"
+
+#include "nvmath/Vector.h"
+#include "nvmath/Matrix.h"
+#include "nvmath/Quaternion.h"
+#include "nvmath/Basis.h"
+#include "nvmath/Box.h"
+#include "nvmath/Plane.inl"
+
+using namespace nv;
+
+Stream & nv::operator<< (Stream & s, Vector2 & v)
+{
+    return s << v.x << v.y;
+}
+
+Stream & nv::operator<< (Stream & s, Vector3 & v)
+{
+    return s << v.x << v.y << v.z;
+}
+
+Stream & nv::operator<< (Stream & s, Vector4 & v)
+{
+    return s << v.x << v.y << v.z << v.w;
+}
+
+Stream & nv::operator<< (Stream & s, Matrix & m)
+{
+    return s; // NOTE(review): no Matrix data is streamed here, unlike the other overloads -- confirm this stub is intended.
+}
+
+Stream & nv::operator<< (Stream & s, Quaternion & q)
+{
+    return s << q.x << q.y << q.z << q.w;
+}
+
+Stream & nv::operator<< (Stream & s, Basis & basis)
+{
+    return s << basis.tangent << basis.bitangent << basis.normal;
+}
+
+Stream & nv::operator<< (Stream & s, Box & box)
+{
+    return s << box.minCorner << box.maxCorner;
+}
+
+Stream & nv::operator<< (Stream & s, Plane & plane)
+{
+    return s << plane.v;
+}
diff --git a/thirdparty/thekla_atlas/nvmath/TypeSerialization.h b/thirdparty/thekla_atlas/nvmath/TypeSerialization.h
new file mode 100644
index 0000000000..32d6de827e
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/TypeSerialization.h
@@ -0,0 +1,35 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_TYPESERIALIZATION_H
+#define NV_MATH_TYPESERIALIZATION_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class Stream;
+
+    class Vector2;
+    class Vector3;
+    class Vector4;
+
+    class Matrix;
+    class Quaternion;
+    class Basis;
+    class Box;
+    class Plane;
+
+    NVMATH_API Stream & operator<< (Stream & s, Vector2 & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Vector3 & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Vector4 & obj);
+
+    NVMATH_API Stream & operator<< (Stream & s, Matrix & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Quaternion & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Basis & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Box & obj);
+    NVMATH_API Stream & operator<< (Stream & s, Plane & obj);
+
+} // nv namespace
+
+#endif // NV_MATH_TYPESERIALIZATION_H
diff --git a/thirdparty/thekla_atlas/nvmath/Vector.cpp b/thirdparty/thekla_atlas/nvmath/Vector.cpp
new file mode 100644
index 0000000000..9122a1b0e9
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Vector.cpp
@@ -0,0 +1,4 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "Vector.h"
+#include "Vector.inl"
diff --git a/thirdparty/thekla_atlas/nvmath/Vector.h b/thirdparty/thekla_atlas/nvmath/Vector.h
new file mode 100644
index 0000000000..ad18672a8a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Vector.h
@@ -0,0 +1,149 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_VECTOR_H
+#define NV_MATH_VECTOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class NVMATH_CLASS Vector2 // 2-component float vector; member functions are defined inline in Vector.inl.
+    {
+    public:
+        typedef Vector2 const & Arg; // parameter-passing type used by the vector functions
+
+        Vector2();                  // leaves x and y uninitialized
+        explicit Vector2(float f);  // sets both components to f
+        Vector2(float x, float y);
+        Vector2(Vector2::Arg v);
+
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
+
+        const Vector2 & operator=(Vector2::Arg v); // returns *this as a const reference
+
+        const float * ptr() const; // address of x, the first of the two floats
+
+        void set(float x, float y);
+
+        Vector2 operator-() const;
+        void operator+=(Vector2::Arg v); // compound operators return void, so they cannot be chained
+        void operator-=(Vector2::Arg v);
+        void operator*=(float s);
+        void operator*=(Vector2::Arg v); // component-wise product
+
+        friend bool operator==(Vector2::Arg a, Vector2::Arg b); // exact float comparison, no epsilon
+        friend bool operator!=(Vector2::Arg a, Vector2::Arg b);
+
+        union {
+            struct {
+                float x, y;
+            };
+            float component[2]; // array view sharing storage with the named x, y members
+        };
+    };
+
+    class NVMATH_CLASS Vector3 // 3-component float vector; member functions are defined inline in Vector.inl.
+    {
+    public:
+        typedef Vector3 const & Arg; // parameter-passing type used by the vector functions
+
+        Vector3();                  // leaves x, y and z uninitialized
+        explicit Vector3(float x);  // despite the parameter name, sets x, y and z to the same value
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
+        Vector3(float x, float y, float z);
+        Vector3(Vector2::Arg v, float z); // x, y taken from v
+        Vector3(Vector3::Arg v);
+
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
+
+        const Vector3 & operator=(Vector3::Arg v); // returns *this as a const reference
+
+        Vector2 xy() const; // copy of the first two components
+
+        const float * ptr() const; // address of x, the first of the three floats
+
+        void set(float x, float y, float z);
+
+        Vector3 operator-() const;
+        void operator+=(Vector3::Arg v); // compound operators return void, so they cannot be chained
+        void operator-=(Vector3::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);        // implemented as a multiply by 1.0f / s
+        void operator*=(Vector3::Arg v); // component-wise product
+        void operator/=(Vector3::Arg v); // component-wise quotient
+
+        friend bool operator==(Vector3::Arg a, Vector3::Arg b); // exact float comparison, no epsilon
+        friend bool operator!=(Vector3::Arg a, Vector3::Arg b);
+
+        union {
+            struct {
+                float x, y, z;
+            };
+            float component[3]; // array view sharing storage with the named x, y, z members
+        };
+    };
+
+    class NVMATH_CLASS Vector4 // 4-component float vector; member functions are defined inline in Vector.inl.
+    {
+    public:
+        typedef Vector4 const & Arg; // parameter-passing type used by the vector functions
+
+        Vector4();                  // leaves x, y, z and w uninitialized
+        explicit Vector4(float x);  // despite the parameter name, sets all four components to the same value
+        Vector4(float x, float y, float z, float w);
+        Vector4(Vector2::Arg v, float z, float w); // x, y taken from v
+        Vector4(Vector2::Arg v, Vector2::Arg u);   // x, y taken from v; z, w taken from u
+        Vector4(Vector3::Arg v, float w);          // x, y, z taken from v
+        Vector4(Vector4::Arg v);
+        //	Vector4(const Quaternion & v);
+
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
+
+        const Vector4 & operator=(Vector4::Arg v); // returns *this as a const reference
+
+        Vector2 xy() const;  // copy of (x, y)
+        Vector2 zw() const;  // copy of (z, w)
+        Vector3 xyz() const; // copy of (x, y, z)
+
+        const float * ptr() const; // address of x, the first of the four floats
+
+        void set(float x, float y, float z, float w);
+
+        Vector4 operator-() const;
+        void operator+=(Vector4::Arg v); // compound operators return void, so they cannot be chained
+        void operator-=(Vector4::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);        // divides each component by s directly
+        void operator*=(Vector4::Arg v); // component-wise product
+        void operator/=(Vector4::Arg v); // component-wise quotient
+
+        friend bool operator==(Vector4::Arg a, Vector4::Arg b); // exact float comparison, no epsilon
+        friend bool operator!=(Vector4::Arg a, Vector4::Arg b);
+
+        union {
+            struct {
+                float x, y, z, w;
+            };
+            float component[4]; // array view sharing storage with the named x, y, z, w members
+        };
+    };
+
+} // nv namespace
+
+// If we had these functions, they would be ambiguous, the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.w); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts:
+template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
+template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
+template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
+
+#endif // NV_MATH_VECTOR_H
diff --git a/thirdparty/thekla_atlas/nvmath/Vector.inl b/thirdparty/thekla_atlas/nvmath/Vector.inl
new file mode 100644
index 0000000000..bcaec7bf2a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/Vector.inl
@@ -0,0 +1,919 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "Vector.h"
+#include "nvcore/Utils.h" // min, max
+#include "nvcore/Hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z,w members and 4 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return !(a == b); // exact negation of operator== (also holds for NaN components)
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    inline void Vector3::operator/=(float s)
+    {
+        const float reciprocal = 1.0f / s; // divide once, then scale every component by the result
+        x *= reciprocal;
+        y *= reciprocal;
+        z *= reciprocal;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return !(a == b); // exact negation of operator== (also holds for NaN components)
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(Vector4::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this; // handed back as a const reference, matching the declaration in Vector.h
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return !(a == b); // exact negation of operator== (also holds for NaN components)
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return scale(v1, v2); // component-wise product, via the two-vector scale() overload
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return dot(v, v); // x*x + y*y
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / length(v); // no zero-length guard: the division is performed unconditionally
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon)); // checked precondition: isZero(l, epsilon) must be false (no fallback here; see normalizeSafe)
+        Vector2 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n)); // checks the result with isNormalized's default epsilon (NV_NORMAL_EPSILON)
+        return n;
+    }
+
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        // Unlike normalize(), a length accepted by isZero() is not treated as an error:
+        // the caller-supplied fallback is returned and no division takes place.
+        const float len = length(v);
+        if (isZero(len, epsilon)) return fallback;
+        return scale(v, 1.0f / len);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float very_small_float = 1.0e-037f; // bias that keeps the divisor below non-zero when v has zero length
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector2 vf = v;
+        nv::floatCleanup(vf.component, 2);
+        return vf;
+    }
+
+    // Note, this is the area scaled by 2!
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {
+	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        // IC: While it may be appealing to use the following expression:
+        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
+
+        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point 
+        // numbers and the results becomes very unstable and dependent on the order of the factors.
+
+        // Instead, it's preferable to subtract the vertices first, and multiply the resulting small values together. The result
+        // in this case is always much more accurate (as long as the triangle is small) and less dependent of the location of 
+        // the triangle.
+
+        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
+        return triangleArea(a-c, b-c);
+    }
+
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return dot(v, v); // x*x + y*y + z*z
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / length(v); // no zero-length guard: the division is performed unconditionally
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon)); // checked precondition: isZero(l, epsilon) must be false (no fallback here; see normalizeSafe)
+        Vector3 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n)); // checks the result with isNormalized's default epsilon (NV_NORMAL_EPSILON)
+        return n;
+    }
+
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        // Unlike normalize(), a length accepted by isZero() is not treated as an error:
+        // the caller-supplied fallback is returned and no division takes place.
+        const float len = length(v);
+        if (isZero(len, epsilon)) return fallback;
+        return scale(v, 1.0f / len);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float very_small_float = 1.0e-037f; // bias that keeps the divisor below non-zero when v has zero length
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector3 vf = v;
+        nv::floatCleanup(vf.component, 3);
+        return vf;
+    }
+
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+	    return v - (2 * dot(v, n)) * n; // subtracts 2*(v.n)*n from v; NOTE(review): a true mirror reflection only if n is unit length -- confirm callers normalize n
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return dot(v, v); // x*x + y*y + z*z + w*w
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / length(v); // no zero-length guard: the division is performed unconditionally
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        nvDebugCheck(!isZero(l, epsilon)); // checked precondition: isZero(l, epsilon) must be false (no fallback here; see normalizeSafe)
+        Vector4 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n)); // checks the result with isNormalized's default epsilon (NV_NORMAL_EPSILON)
+        return n;
+    }
+
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        // Unlike normalize(), a length accepted by isZero() is not treated as an error:
+        // the caller-supplied fallback is returned and no division takes place.
+        const float len = length(v);
+        if (isZero(len, epsilon)) return fallback;
+        return scale(v, 1.0f / len);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float very_small_float = 1.0e-037f; // bias that keeps the divisor below non-zero when v has zero length
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector4 vf = v;
+        nv::floatCleanup(vf.component, 4);
+        return vf;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL
diff --git a/thirdparty/thekla_atlas/nvmath/ftoi.h b/thirdparty/thekla_atlas/nvmath/ftoi.h
new file mode 100644
index 0000000000..182c56d1c3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/ftoi.h
@@ -0,0 +1,261 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MATH_FTOI_H
+#define NV_MATH_FTOI_H
+
+#include "nvmath/nvmath.h"
+
+#include <math.h>
+
+namespace nv
+{
+    // Optimized float to int conversions. See:
+    // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
+    // http://www.stereopsis.com/sree/fpu2006.html
+    // http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
+    // http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
+
+
+    union DoubleAnd64 {     // Overlays a double with a 64-bit integer so the bits of the double can be read (see ftoi_round_xs).
+        uint64    i;        // integer view of the shared 64 bits
+        double    d;        // floating point view of the shared 64 bits
+    };
+
+    static const double floatutil_xs_doublemagic = (6755399441055744.0);                            // 2^52 * 1.5; added to a value before the low bits of the sum are read as an integer
+    static const double floatutil_xs_doublemagicdelta = (1.5e-8);                                   // small margin that keeps the rounding bias strictly below .5
+    static const double floatutil_xs_doublemagicroundeps = (0.5f - floatutil_xs_doublemagicdelta);  // just under .5; subtracted (floor) or added (ceil) before rounding
+
+    NV_FORCEINLINE int ftoi_round_xs(double val, double magic) {    // Converts val to int by adding magic and reading back the low 32 bits of the sum.
+#if 1
+        DoubleAnd64 dunion;
+        dunion.d = val + magic;
+        return (int32) dunion.i; // just cast to grab the bottom bits
+#else
+        val += magic;
+        return ((int*)&val)[0]; // @@ Assumes little endian.
+#endif
+    }
+
+    NV_FORCEINLINE int ftoi_round_xs(float val) {   // Round using the default magic constant.
+        return ftoi_round_xs(double(val), floatutil_xs_doublemagic);
+    }
+    // Floor: bias the value down by just under one half, then round.
+    NV_FORCEINLINE int ftoi_floor_xs(float val) {
+        return ftoi_round_xs(double(val) - floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
+    }
+    // Ceil: bias the value up by just under one half, then round.
+    NV_FORCEINLINE int ftoi_ceil_xs(float val) {
+        return ftoi_round_xs(double(val) + floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
+    }
+    // Truncate: negative inputs take the ceil path, everything else the floor path.
+    NV_FORCEINLINE int ftoi_trunc_xs(float val) {
+        return !(val < 0) ? ftoi_floor_xs(val) : ftoi_ceil_xs(val);
+    }
+
+// -- GODOT start --
+//#if NV_CPU_X86 || NV_CPU_X86_64
+#if NV_USE_SSE
+// -- GODOT end --
+
+    NV_FORCEINLINE int ftoi_round_sse(float f) {
+        return _mm_cvt_ss2si(_mm_set_ss(f));    // scalar float -> int conversion; rounding follows the current SSE rounding mode
+    }
+
+    NV_FORCEINLINE int ftoi_trunc_sse(float f) {
+      return _mm_cvtt_ss2si(_mm_set_ss(f));     // scalar float -> int conversion with truncation
+    }
+
+#endif
+
+
+
+#if NV_USE_SSE
+
+    NV_FORCEINLINE int ftoi_round(float val) {      // SSE build: forwards to the intrinsic based rounding.
+        return ftoi_round_sse(val);
+    }
+    // SSE build: forwards to the intrinsic based truncation.
+    NV_FORCEINLINE int ftoi_trunc(float f) {
+      return ftoi_trunc_sse(f);
+    }
+
+    // We can probably do better than this. See for example:
+    // http://dss.stephanierct.com/DevBlog/?p=8
+    NV_FORCEINLINE int ftoi_floor(float val) {
+        return ftoi_round(floorf(val));     // floorf already yields an integral value; ftoi_round only converts it
+    }
+
+    NV_FORCEINLINE int ftoi_ceil(float val) {
+        return ftoi_round(ceilf(val));      // ceilf already yields an integral value; ftoi_round only converts it
+    }
+
+#else
+
+    // In theory this should work with any double floating point math implementation, but it appears that MSVC produces incorrect code
+    // when SSE2 is targeted and fast math is enabled (/arch:SSE2 & /fp:fast). These problems go away with /fp:precise, which is the default mode.
+
+    NV_FORCEINLINE int ftoi_round(float val) {      // Non-SSE build: all four conversions forward to the magic number versions.
+        return ftoi_round_xs(val);
+    }
+    // Round down.
+    NV_FORCEINLINE int ftoi_floor(float val) {
+        return ftoi_floor_xs(val);
+    }
+    // Round up.
+    NV_FORCEINLINE int ftoi_ceil(float val) {
+        return ftoi_ceil_xs(val);
+    }
+    // Round toward zero.
+    NV_FORCEINLINE int ftoi_trunc(float f) {
+      return ftoi_trunc_xs(f);
+    }
+
+#endif
+
+
+    inline void test_ftoi() {
+        // Self test: checks ftoi_round/trunc/floor/ceil against hand-picked values with nvCheck.
+        // Round to nearest integer.
+        nvCheck(ftoi_round(0.1f) == 0);
+        nvCheck(ftoi_round(0.6f) == 1);
+        nvCheck(ftoi_round(-0.2f) == 0);
+        nvCheck(ftoi_round(-0.7f) == -1);
+        nvCheck(ftoi_round(10.1f) == 10);
+        nvCheck(ftoi_round(10.6f) == 11);
+        nvCheck(ftoi_round(-90.1f) == -90);
+        nvCheck(ftoi_round(-90.6f) == -91);
+
+        nvCheck(ftoi_round(0) == 0);
+        nvCheck(ftoi_round(1) == 1);
+        nvCheck(ftoi_round(-1) == -1);
+        
+        nvCheck(ftoi_round(0.5f) == 0);  // How are midpoints rounded? Bankers rounding.
+        nvCheck(ftoi_round(1.5f) == 2);
+        nvCheck(ftoi_round(2.5f) == 2);
+        nvCheck(ftoi_round(3.5f) == 4);
+        nvCheck(ftoi_round(4.5f) == 4);
+        nvCheck(ftoi_round(-0.5f) == 0);
+        nvCheck(ftoi_round(-1.5f) == -2);
+                
+
+        // Truncation (round down if > 0, round up if < 0).
+        nvCheck(ftoi_trunc(0.1f) == 0);
+        nvCheck(ftoi_trunc(0.6f) == 0);
+        nvCheck(ftoi_trunc(-0.2f) == 0);
+        nvCheck(ftoi_trunc(-0.7f) == 0);    // @@ When using /arch:SSE2 in Win32, msvc produce wrong code for this one. It is skipping the addition.
+        nvCheck(ftoi_trunc(1.99f) == 1);
+        nvCheck(ftoi_trunc(-1.2f) == -1);
+
+        // Floor (round down).
+        nvCheck(ftoi_floor(0.1f) == 0);
+        nvCheck(ftoi_floor(0.6f) == 0);
+        nvCheck(ftoi_floor(-0.2f) == -1);
+        nvCheck(ftoi_floor(-0.7f) == -1);
+        nvCheck(ftoi_floor(1.99f) == 1);
+        nvCheck(ftoi_floor(-1.2f) == -2);
+
+        nvCheck(ftoi_floor(0) == 0);
+        nvCheck(ftoi_floor(1) == 1);
+        nvCheck(ftoi_floor(-1) == -1);
+        nvCheck(ftoi_floor(2) == 2);
+        nvCheck(ftoi_floor(-2) == -2);
+
+        // Ceil (round up).
+        nvCheck(ftoi_ceil(0.1f) == 1);
+        nvCheck(ftoi_ceil(0.6f) == 1);
+        nvCheck(ftoi_ceil(-0.2f) == 0);
+        nvCheck(ftoi_ceil(-0.7f) == 0);
+        nvCheck(ftoi_ceil(1.99f) == 2);
+        nvCheck(ftoi_ceil(-1.2f) == -1);
+
+        nvCheck(ftoi_ceil(0) == 0);
+        nvCheck(ftoi_ceil(1) == 1);
+        nvCheck(ftoi_ceil(-1) == -1);
+        nvCheck(ftoi_ceil(2) == 2);
+        nvCheck(ftoi_ceil(-2) == -2);
+    }
+
+
+
+
+
+    // Safe versions using standard casts.
+
+    inline int iround(float f) {
+        // Delegates to ftoi_round; the floorf based variant below is kept for reference only.
+        return ftoi_round(f);
+        //return int(floorf(f + 0.5f));
+    }
+
+    inline int iround(double f) {
+        const double shifted = f + 0.5;     // move the rounding boundary onto the integers
+        return int(::floor(shifted));
+    }
+
+    inline int ifloor(float f) {
+        // Delegates to ftoi_floor; the floorf based variant below is kept for reference only.
+        return ftoi_floor(f);
+        //return int(floorf(f));
+    }
+
+    inline int iceil(float f) {
+        const float raised = ceilf(f);
+        return int(raised);
+    }
+
+
+
+    // I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule.
+    // Some notes from cbloom: http://cbloomrants.blogspot.com/2011/07/07-26-11-pixel-int-to-float-options.html
+
+    // Quantize a float in the [0,1] range, using exact end points or uniform bins.
+    inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
+        nvDebugCheck(bits <= 16);
+        const float binCount = float(1 << bits);
+        if (!exactEndPoints) {
+            // Uniform bins: snap x to the center of the bin that contains it.
+            return (floorf(x * binCount) + 0.5f) / binCount;
+        }
+        // Exact end points: round to the nearest of (binCount - 1) equal steps, so 0 and 1 map to themselves.
+        const float steps = binCount - 1;
+        return floorf(x * steps + 0.5f) / steps;
+    }
+
+
+    // This is the most common rounding mode:
+    // 
+    //   0     1       2     3
+    // |___|_______|_______|___|
+    // 0                       1
+    //
+    // You get that if you take the unit floating point number multiply by 'N-1' and round to nearest. That is, `i = round(f * (N-1))`.
+    // You reconstruct the original float dividing by 'N-1': `f = i / (N-1)`
+
+
+    //    0     1     2     3
+    // |_____|_____|_____|_____|
+    // 0                       1
+
+    /*enum BinningMode {
+        RoundMode_ExactEndPoints,       
+        RoundMode_UniformBins,
+    };*/
+
+    template <int N>
+    inline uint unitFloatToFixed(float f) {
+        const int maxCode = (1 << N) - 1;   // largest value representable with N bits
+        return ftoi_round(f * maxCode);
+    }
+    inline uint8 unitFloatToFixed8(float f) {
+        return static_cast<uint8>(unitFloatToFixed<8>(f));
+    }
+
+    inline uint16 unitFloatToFixed16(float f) {
+        return static_cast<uint16>(unitFloatToFixed<16>(f));
+    }
+
+
+} // nv
+
+#endif // NV_MATH_FTOI_H
diff --git a/thirdparty/thekla_atlas/nvmath/nvmath.h b/thirdparty/thekla_atlas/nvmath/nvmath.h
new file mode 100644
index 0000000000..f2b69426e1
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmath/nvmath.h
@@ -0,0 +1,346 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include "nvcore/nvcore.h"
+#include "nvcore/Debug.h"   // nvDebugCheck
+#include "nvcore/Utils.h"   // max, clamp
+
+#include <math.h>
+
+#if NV_OS_WIN32 || NV_OS_XBOX || NV_OS_DURANGO
+#include <float.h>  // finite, isnan
+#endif
+
+// -- GODOT start --
+//#if NV_CPU_X86 || NV_CPU_X86_64
+//    //#include <intrin.h>
+//    #include <xmmintrin.h>
+//#endif
+// -- GODOT end --
+
+
+
+// Function linkage
+#if NVMATH_SHARED
+#ifdef NVMATH_EXPORTS
+#define NVMATH_API DLL_EXPORT
+#define NVMATH_CLASS DLL_EXPORT_CLASS
+#else
+#define NVMATH_API DLL_IMPORT
+#define NVMATH_CLASS DLL_IMPORT
+#endif
+#else // NVMATH_SHARED
+#define NVMATH_API
+#define NVMATH_CLASS
+#endif // NVMATH_SHARED
+
+// Set some reasonable defaults.
+#ifndef NV_USE_ALTIVEC
+#   define NV_USE_ALTIVEC NV_CPU_PPC
+//#   define NV_USE_ALTIVEC defined(__VEC__)
+#endif
+
+#ifndef NV_USE_SSE
+#   if NV_CPU_X86_64
+        // x64 always supports at least SSE2
+#       define NV_USE_SSE 2
+#   elif NV_CC_MSVC && defined(_M_IX86_FP)
+        // Also on x86 with the /arch:SSE flag in MSVC.
+#       define NV_USE_SSE _M_IX86_FP       // 1=SSE, 2=SSE2
+#   elif defined(__SSE2__)                 // must be tested before __SSE__, which is also defined whenever SSE2 is enabled
+#       define NV_USE_SSE 2
+#   elif defined(__SSE__)
+#       define NV_USE_SSE 1
+#   else
+        // Otherwise we assume no SSE.
+#       define NV_USE_SSE 0
+#   endif
+#endif
+
+
+// Internally set NV_USE_SIMD when either altivec or sse is available.
+#if NV_USE_ALTIVEC && NV_USE_SSE
+#	error "Cannot enable both altivec and sse!"
+#endif
+
+
+// -- GODOT start --
+#if NV_USE_SSE
+    //#include <intrin.h>
+    #include <xmmintrin.h>
+#endif
+// -- GODOT end --
+
+
+#ifndef PI
+#define PI                  float(3.1415926535897932384626433833)
+#endif
+
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)
+
+/*
+#define SQ(r)               ((r)*(r))
+
+#define SIGN_BITMASK        0x80000000
+
+/// Integer representation of a floating-point value.
+#define IR(x)               ((uint32 &)(x))
+
+/// Absolute integer representation of a floating-point value
+#define AIR(x)              (IR(x) & 0x7fffffff)
+
+/// Floating-point representation of an integer value.
+#define FR(x)               ((float&)(x))
+
+/// Integer-based comparison of a floating point value.
+/// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context.
+#define IS_NEGATIVE_FLOAT(x) (IR(x)&SIGN_BITMASK)
+*/
+
+extern "C" inline double sqrt_assert(const double f)    // sqrt with a debug check that the argument is not negative
+{
+    nvDebugCheck(f >= 0.0f);
+    return sqrt(f);
+}
+// Single precision variant of sqrt_assert.
+inline float sqrtf_assert(const float f)
+{
+    nvDebugCheck(f >= 0.0f);
+    return sqrtf(f);
+}
+// acos with a debug check that the argument lies in [-1, 1].
+extern "C" inline double acos_assert(const double f) 
+{
+    nvDebugCheck(f >= -1.0f && f <= 1.0f);
+    return acos(f);
+}
+// Single precision variant of acos_assert.
+inline float acosf_assert(const float f)
+{
+    nvDebugCheck(f >= -1.0f && f <= 1.0f);
+    return acosf(f);
+}
+// asin with a debug check that the argument lies in [-1, 1].
+extern "C" inline double asin_assert(const double f)
+{
+    nvDebugCheck(f >= -1.0f && f <= 1.0f);
+    return asin(f);
+}
+// Single precision variant of asin_assert.
+inline float asinf_assert(const float f)
+{
+    nvDebugCheck(f >= -1.0f && f <= 1.0f);
+    return asinf(f);
+}
+
+// Replace default functions with asserting ones.
+#if !NV_CC_MSVC || (NV_CC_MSVC && (_MSC_VER < 1700))    // IC: Apparently this was causing problems in Visual Studio 2012. See Issue 194: https://code.google.com/p/nvidia-texture-tools/issues/detail?id=194
+#define sqrt sqrt_assert
+#define sqrtf sqrtf_assert
+#define acos acos_assert
+#define acosf acosf_assert
+#define asin asin_assert
+#define asinf asinf_assert
+#endif
+
+#if NV_CC_MSVC
+NV_FORCEINLINE float log2f(float x)     // MSVC only: base 2 logarithm built from logf
+{
+    nvCheck(x >= 0);
+    return logf(x) / logf(2.0f);        // change of base: log2(x) = ln(x) / ln(2)
+}
+NV_FORCEINLINE float exp2f(float x)     // MSVC only: 2 raised to the power x
+{
+    return powf(2.0f, x);
+}
+#endif
+
+namespace nv
+{
+    inline float toRadian(float degree) { return degree * (PI / 180.0f); }     // degrees -> radians
+    inline float toDegree(float radian) { return radian * (180.0f / PI); }     // radians -> degrees
+
+    // Robust floating point comparisons:
+    // http://realtimecollisiondetection.net/blog/?p=89
+    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON) {
+        // Relative tolerance: epsilon is scaled by the larger magnitude, but never by less than 1.
+        const float scale = max3(1.0f, fabsf(f0), fabsf(f1));
+        return fabsf(f0-f1) <= epsilon * scale;
+    }
+
+    inline bool isZero(const float f, const float epsilon = NV_EPSILON) {
+        // Absolute tolerance test around zero.
+        return fabsf(f) <= epsilon;
+    }
+
+    inline bool isFinite(const float f)     // Forwards to the platform's finiteness test; unsupported platforms fail to compile.
+    {
+#if NV_OS_WIN32 || NV_OS_XBOX || NV_OS_DURANGO
+        return _finite(f) != 0;
+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD || NV_OS_ORBIS
+        return isfinite(f);
+#elif NV_OS_LINUX
+        return finitef(f);
+#else
+#   error "isFinite not supported"
+#endif
+        //return std::isfinite (f);
+        //return finite (f);
+    }
+
+    inline bool isNan(const float f)        // Forwards to the platform's NaN test; unsupported platforms fail to compile.
+    {
+#if NV_OS_WIN32 || NV_OS_XBOX || NV_OS_DURANGO
+        return _isnan(f) != 0;
+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD || NV_OS_ORBIS
+        return isnan(f);
+#elif NV_OS_LINUX
+        return isnanf(f);
+#else
+#   error "isNan not supported"
+#endif
+    }
+
+    inline uint log2(uint32 i) {
+        // Counts how many right shifts it takes to reach zero after the first one; 0 and 1 both give 0.
+        uint32 shifts = 0;
+        for (i >>= 1; i != 0; i >>= 1) ++shifts;
+        return shifts;
+    }
+
+    inline uint log2(uint64 i) {
+        // 64-bit variant of the same shift count; the result is narrowed with U32.
+        uint64 shifts = 0;
+        for (i >>= 1; i != 0; i >>= 1) ++shifts;
+        return U32(shifts);
+    }
+
+    inline float lerp(float f0, float f1, float t) {
+        // Blend of the two values: f0 is weighted by (1 - t), f1 by t.
+        const float weight0 = 1.0f - t;
+        return f0 * weight0 + f1 * t;
+    }
+
+    inline float square(float f) { return f * f; }      // f^2
+    inline int square(int i) { return i * i; }          // i^2, plain int arithmetic
+
+    inline float cube(float f) { return f * f * f; }    // f^3
+    inline int cube(int i) { return i * i * i; }        // i^3, plain int arithmetic
+
+    inline float frac(float f)      // Distance from f down to floor(f).
+    {
+        return f - floor(f);
+    }
+
+    inline float floatRound(float f)    // Rounds by flooring f + 0.5, so exact halves go up.
+    {
+        return floorf(f + 0.5f);
+    }
+
+    // Eliminates negative zeros from a float array.
+    inline void floatCleanup(float * fp, int n)
+    {
+        for (int idx = 0; idx < n; ++idx) {
+            union { float f; uint32 i; } bits = { fp[idx] };
+            const bool isNegativeZero = (bits.i == 0x80000000);     // only the top bit set
+            if (isNegativeZero) fp[idx] = 0.0f;
+        }
+    }
+
+    inline float saturate(float f) {
+        return clamp(f, 0.0f, 1.0f);    // restrict to the unit interval
+    }
+
+    inline float linearstep(float edge0, float edge1, float x) {
+        const float t = (x - edge0) / (edge1 - edge0);      // position of x relative to the two edges
+        return saturate(t);
+    }
+
+    inline float smoothstep(float edge0, float edge1, float x) {
+        const float t = linearstep(edge0, edge1, x);
+
+        // Cubic 3t^2 - 2t^3 evaluated on the saturated parameter.
+        return t*t*(3 - 2*t);
+    }
+
+    inline int sign(float a)
+    {
+        // +1 for positive values, -1 for negative values, 0 for anything else.
+        if (a > 0) return 1;
+        if (a < 0) return -1;
+        return 0;
+    }
+
+    union Float754 {            // Three views of one 32-bit value: raw bits, float, and sign/exponent/mantissa fields.
+        unsigned int raw;       // all 32 bits as an integer
+        float value;            // the same bits as a float
+        struct {
+        #if NV_BIG_ENDIAN
+            unsigned int negative:1;
+            unsigned int biasedexponent:8;
+            unsigned int mantissa:23;
+        #else
+            unsigned int mantissa:23;
+            unsigned int biasedexponent:8;
+            unsigned int negative:1;
+        #endif
+        } field;                // field order is flipped on big endian builds
+    };
+
+    // Return the exponent of x ~ Floor(Log2(x))
+    inline int floatExponent(float x) {
+        Float754 bits;
+        bits.value = x;
+        const int biased = bits.field.biasedexponent;
+        return biased - 127;    // subtract the exponent bias
+    }
+
+
+    // FloatRGB9E5: three 9-bit mantissas and one 5-bit exponent packed into 32 bits.
+    union Float3SE {
+        uint32 v;               // the packed value as a whole
+        struct {
+        #if NV_BIG_ENDIAN
+            uint32 e : 5;
+            uint32 zm : 9;
+            uint32 ym : 9;
+            uint32 xm : 9;
+        #else
+            uint32 xm : 9;
+            uint32 ym : 9;
+            uint32 zm : 9;
+            uint32 e : 5;
+        #endif
+        };
+    };
+
+    // FloatR11G11B10: x and y use 6+5 bits, z uses 5+5 bits (fields suffixed m and e), 32 bits in total.
+    union Float3PK {
+        uint32 v;               // the packed value as a whole
+        struct {
+        #if NV_BIG_ENDIAN
+            uint32 ze : 5;
+            uint32 zm : 5;
+            uint32 ye : 5;
+            uint32 ym : 6;
+            uint32 xe : 5;
+            uint32 xm : 6;
+        #else
+            uint32 xm : 6;
+            uint32 xe : 5;
+            uint32 ym : 6;
+            uint32 ye : 5;
+            uint32 zm : 5;
+            uint32 ze : 5;
+        #endif
+        };
+    };
+
+
+} // nv
+
+#endif // NV_MATH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/BaseMesh.cpp b/thirdparty/thekla_atlas/nvmesh/BaseMesh.cpp
new file mode 100644
index 0000000000..f17d3b46fd
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/BaseMesh.cpp
@@ -0,0 +1,19 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "BaseMesh.h"
+#include "Stream.h"
+#include "nvmath/TypeSerialization.h"
+
+
+namespace nv
+{
+	static Stream & operator<< (Stream & s, BaseMesh::Vertex & vertex)	// Streams the vertex fields in the order id, pos, nor, tex.
+	{
+		return s << vertex.id << vertex.pos << vertex.nor << vertex.tex;
+	}
+
+	Stream & operator<< (Stream & s, BaseMesh & mesh)	// Streams the mesh, which consists of its vertex array only.
+	{
+		return s << mesh.m_vertexArray;
+	}
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/BaseMesh.h b/thirdparty/thekla_atlas/nvmesh/BaseMesh.h
new file mode 100644
index 0000000000..c8559511f1
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/BaseMesh.h
@@ -0,0 +1,72 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MESH_BASEMESH_H
+#define NV_MESH_BASEMESH_H
+
+#include "nvmesh.h"
+#include "nvmath/Vector.h"
+#include "nvcore/Array.h"
+#include "nvcore/Hash.h"
+
+namespace nv
+{
+
+    /// Base mesh without connectivity.
+    class BaseMesh
+    {
+    public:
+        struct Vertex;
+
+        BaseMesh() {}
+        // Creates the mesh with its vertex array constructed from vertexNum.
+        BaseMesh(uint vertexNum) :
+            m_vertexArray(vertexNum) {}
+
+        // Vertex methods.
+        uint vertexCount() const { return m_vertexArray.count(); }
+        const Vertex & vertexAt(uint i) const { return m_vertexArray[i]; }
+        Vertex & vertexAt(uint i) { return m_vertexArray[i]; }
+        const Array<Vertex> & vertices() const { return m_vertexArray; }
+        Array<Vertex> & vertices() { return m_vertexArray; }
+        // Stream operator, declared a friend so it can reach m_vertexArray.
+        friend Stream & operator<< (Stream & s, BaseMesh & obj);
+
+    protected:
+        // Vertex storage; the only data member of the class.
+        Array<Vertex> m_vertexArray;
+    };
+
+
+    /// BaseMesh vertex.
+    struct BaseMesh::Vertex
+    {
+        Vertex() : id(NIL), pos(0.0f), nor(0.0f), tex(0.0f) {}     // id starts as NIL, all vectors as zero
+
+        uint id;		// @@ Vertex should be an index into the vertex data.
+        Vector3 pos;    // position
+        Vector3 nor;    // normal
+        Vector2 tex;    // texture coordinate
+    };
+
+    inline bool operator==(const BaseMesh::Vertex & a, const BaseMesh::Vertex & b)     // Compares pos, nor and tex; the id field is not compared.
+    {
+        return a.pos == b.pos && a.nor == b.nor && a.tex == b.tex;
+    }
+
+    inline bool operator!=(const BaseMesh::Vertex & a, const BaseMesh::Vertex & b)     // True as soon as pos, nor or tex differs; exact negation of operator==.
+    {
+        return a.pos != b.pos || a.nor != b.nor || a.tex != b.tex;
+    }
+
+    template <> struct Hash<BaseMesh::Vertex>      // Hash specialization for mesh vertices.
+    {
+        uint operator()(const BaseMesh::Vertex & v) const
+        {
+            return Hash<Vector3>()(v.pos);          // only the position takes part in the hash
+        }
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_BASEMESH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/MeshBuilder.cpp b/thirdparty/thekla_atlas/nvmesh/MeshBuilder.cpp
new file mode 100644
index 0000000000..24d8ddff89
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/MeshBuilder.cpp
@@ -0,0 +1,1000 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmesh.h" // pch
+
+#include "MeshBuilder.h"
+#include "TriMesh.h"
+#include "QuadTriMesh.h"
+#include "halfedge/Mesh.h"
+#include "halfedge/Vertex.h"
+#include "halfedge/Face.h"
+
+#include "weld/Weld.h"
+
+#include "nvmath/Box.h"
+#include "nvmath/Vector.inl"
+
+#include "nvcore/StrLib.h"
+#include "nvcore/RadixSort.h"
+#include "nvcore/Ptr.h"
+#include "nvcore/Array.inl"
+#include "nvcore/HashMap.inl"
+
+
+using namespace nv;
+
+/*
+By default the mesh builder creates 3 streams (position, normal, texcoord), I'm planning to add support for extra streams as follows:
+
+enum StreamType { StreamType_Float, StreamType_Vector2, StreamType_Vector3, StreamType_Vector4 };
+
+uint addStream(const char *, uint idx, StreamType);
+
+uint addAttribute(float)
+uint addAttribute(Vector2)
+uint addAttribute(Vector3)
+uint addAttribute(Vector4)
+
+struct Vertex
+{
+    uint pos;
+    uint nor;
+    uint tex;
+    uint * attribs;	// NULL or NIL terminated array?
+};
+
+All streams must be added before hand, so that you know the size of the attribs array.
+
+The vertex hash function could be kept as is, but the == operator should be extended to test 
+the extra attributes when available.
+
+That might require a custom hash implementation, or an extension of the current one. How to
+handle the variable number of attributes in the attribs array?
+
+bool operator()(const Vertex & a, const Vertex & b) const
+{ 
+    if (a.pos != b.pos || a.nor != b.nor || a.tex != b.tex) return false;
+    if (a.attribs == NULL && b.attribs == NULL) return true;
+    return 0 == memcmp(a.attribs, b.attribs, ???);
+}
+
+We could use a NIL terminated array, or provide custom user data to the equals functor.
+
+vertexMap.setUserData((void *)vertexAttribCount);
+
+bool operator()(const Vertex & a, const Vertex & b, void * userData) const { ... }
+
+*/
+
+
+
+namespace 
+{
+    struct Material     // A named material together with the number of faces using it.
+    {
+        Material() : faceCount(0) {}
+        Material(const String & str) : name(str), faceCount(0) {}
+
+        String name;
+        uint faceCount; // starts at zero in both constructors
+    };
+
+    struct Vertex       // Builder vertex: a tuple of indices, one per attribute stream.
+    {
+        //Vertex() {}
+        //Vertex(uint p, uint n, uint t0, uint t1, uint c) : pos(p), nor(n), tex0(t0), tex1(t1), col(c) {}
+        // Two vertices are equal when all seven indices match.
+        friend bool operator==(const Vertex & a, const Vertex & b)
+        {
+            return a.pos == b.pos && a.nor == b.nor && a.tex[0] == b.tex[0] && a.tex[1] == b.tex[1] && a.col[0] == b.col[0] && a.col[1] == b.col[1] && a.col[2] == b.col[2];
+        }
+
+        uint pos;       // position index
+        uint nor;       // normal index
+        uint tex[2];    // one index per texcoord set
+        uint col[3];    // one index per color set
+    };
+
+    struct Face             // A polygon stored as a range of the builder's index array.
+    {
+        uint id;
+        uint firstIndex;    // offset of the first index of this face in the index array
+        uint indexCount;    // number of indices that belong to this face
+        uint material;      // current material when the face was begun, possibly NIL
+        uint group;         // current group when the face was begun, possibly NIL
+    };
+
+} // namespace
+
+
+namespace nv
+{
+    // This is a much better hash than the default and greatly improves performance!
+    template <> struct Hash<Vertex>
+    {
+        uint operator()(const Vertex & v) const { return v.pos + v.nor + v.tex[0]/* + v.col*/; }   // sums three of the indices; tex[1] and col are left out
+    };
+}
+
+struct MeshBuilder::PrivateData     // Internal state of the mesh builder.
+{
+    PrivateData() : currentGroup(NIL), currentMaterial(NIL), maxFaceIndexCount(0) {}
+    // Both overloads return the index of the vertex in vertexArray.
+    uint pushVertex(uint p, uint n, uint t0, uint t1, uint c0, uint c1, uint c2);
+    uint pushVertex(const Vertex & v);
+    // Attribute streams referenced by the Vertex indices.
+    Array<Vector3> posArray;
+    Array<Vector3> norArray;
+    Array<Vector2> texArray[2];
+    Array<Vector4> colArray[3];
+    // Vertices and the map used to find an already stored vertex.
+    Array<Vertex> vertexArray;
+    HashMap<Vertex, uint> vertexMap;
+    // Materials and the name -> index map.
+    HashMap<String, uint> materialMap;
+    Array<Material> materialArray;
+    // Group and material assigned to newly begun faces; NIL when none is active.
+    uint currentGroup;
+    uint currentMaterial;
+    // Face index storage and the faces referencing ranges of it.
+    Array<uint> indexArray;
+    Array<Face> faceArray;
+    // Largest index count among the accepted faces.
+    uint maxFaceIndexCount;
+};
+
+
+uint MeshBuilder::PrivateData::pushVertex(uint p, uint n, uint t0, uint t1, uint c0, uint c1, uint c2)
+{
+    // Pack the attribute indices into a Vertex in member order
+    // (pos, nor, tex[2], col[3]) and hand it to the Vertex overload.
+    const Vertex key = {
+        p,
+        n,
+        { t0, t1 },
+        { c0, c1, c2 }
+    };
+    return pushVertex(key);
+}
+
+uint MeshBuilder::PrivateData::pushVertex(const Vertex & v)
+{
+    // Reuse the index of an equal vertex when the map already has one.
+    uint existing;
+    if (vertexMap.get(v, &existing))
+    {
+        return existing;
+    }
+
+    // Otherwise store it; its index is the array size before the insertion.
+    const uint fresh = vertexArray.count();
+    vertexArray.pushBack(v);
+    vertexMap.add(v, fresh);
+    return fresh;
+}
+
+
+MeshBuilder::MeshBuilder() : d(new PrivateData())      // allocates the private state
+{
+}
+
+MeshBuilder::~MeshBuilder()
+{
+    nvDebugCheck(d != NULL);
+    delete d;                                           // releases the state allocated by the constructor
+}
+
+
+// Builder methods.
+uint MeshBuilder::addPosition(const Vector3 & v) {
+    const uint index = d->posArray.count();     // slot the new position will occupy
+    d->posArray.pushBack(validate(v));
+    return index;
+}
+
+
+uint MeshBuilder::addNormal(const Vector3 & v) {
+    const uint index = d->norArray.count();     // slot the new normal will occupy
+    d->norArray.pushBack(validate(v));
+    return index;
+}
+
+uint MeshBuilder::addTexCoord(const Vector2 & v, uint set/*=0*/) {
+    nvDebugCheck(set < 2);      // texArray has exactly two sets; catch bad set indices in debug builds
+    d->texArray[set].pushBack(validate(v));
+    return d->texArray[set].count() - 1;
+}
+
+uint MeshBuilder::addColor(const Vector4 & v, uint set/*=0*/) {
+    nvDebugCheck(set < 3);      // colArray has exactly three sets; catch bad set indices in debug builds
+    d->colArray[set].pushBack(validate(v));
+    return d->colArray[set].count() - 1;
+}
+
+void MeshBuilder::beginGroup(uint id)   // Faces begun from now on are tagged with group id.
+{
+    d->currentGroup = id;
+}
+
+void MeshBuilder::endGroup()            // Faces begun from now on get no group (NIL).
+{
+    d->currentGroup = NIL;
+}
+
+// Add a named material, reusing the existing index when the name is already known.
+uint MeshBuilder::addMaterial(const char * name)
+{
+    uint index;
+    if (d->materialMap.get(name, &index)) {
+        // Known name: the map entry has to agree with the array entry.
+        nvDebugCheck(d->materialArray[index].name == name);
+        return index;
+    }
+    // New name: it takes the next slot of the material array.
+    index = d->materialArray.count();
+    d->materialMap.add(name, index);
+    Material material(name);
+    d->materialArray.append(material);
+    return index;
+}
+
+void MeshBuilder::beginMaterial(uint id)    // Faces begun from now on use material id.
+{
+    d->currentMaterial = id;
+}
+
+void MeshBuilder::endMaterial()             // Faces begun from now on get no material (NIL).
+{
+    d->currentMaterial = NIL;
+}
+
+void MeshBuilder::beginPolygon(uint id/*=0*/) {
+    // Open a new, still empty face whose indices start at the current end of the index array.
+    const Face face = {
+        id,                         // id
+        d->indexArray.count(),      // firstIndex
+        0,                          // indexCount
+        d->currentMaterial,         // material
+        d->currentGroup             // group
+    };
+    d->faceArray.pushBack(face);
+}
+
+uint MeshBuilder::addVertex(uint p, uint n/*= NIL*/, uint t0/*= NIL*/, uint t1/*= NIL*/, uint c0/*= NIL*/, uint c1/*= NIL*/, uint c2/*= NIL*/)
+{
+    // @@ In theory there's no need to add vertices before faces, but I'm adding this to debug problems in our maya exporter:
+    nvDebugCheck(p < d->posArray.count());
+    nvDebugCheck(n == NIL || n < d->norArray.count());
+    nvDebugCheck(t0 == NIL || t0 < d->texArray[0].count());
+    nvDebugCheck(t1 == NIL || t1 < d->texArray[1].count());
+    //nvDebugCheck(c0 == NIL || c0 < d->colArray[0].count());
+    if (c0 >= d->colArray[0].count()) c0 = NIL;   // @@ Out of range (seen in loc_swamp_catwalk.mb, no idea why): drop the color. >= because c0 == count is already one past the end.
+    nvDebugCheck(c1 == NIL || c1 < d->colArray[1].count());
+    nvDebugCheck(c2 == NIL || c2 < d->colArray[2].count());
+
+    uint idx = d->pushVertex(p, n, t0, t1, c0, c1, c2);    // index of the stored (possibly reused) vertex
+    d->indexArray.pushBack(idx);
+    d->faceArray.back().indexCount++;                      // the vertex belongs to the most recently begun face
+    return idx;
+}
+
+uint MeshBuilder::addVertex(const Vector3 & pos)
+{
+    // Register the position first, then add a vertex that references only that position.
+    return addVertex(addPosition(pos));
+}
+
+#if 0
+uint MeshBuilder::addVertex(const Vector3 & pos, const Vector3 & nor, const Vector2 & tex0, const Vector2 & tex1, const Vector4 & col0, const Vector4 & col1)
+{
+    uint p = addPosition(pos);
+    uint n = addNormal(nor);
+    uint t0 = addTexCoord(tex0, 0);
+    uint t1 = addTexCoord(tex1, 1);
+    uint c0 = addColor(col0);
+    uint c1 = addColor(col1);
+    return addVertex(p, n, t0, t1, c0, c1);
+}
+#endif
+
+// Closes the most recently begun face. Returns true if the face is valid and was kept; invalid faces are removed and false is returned.
+bool MeshBuilder::endPolygon()
+{
+    const Face & face = d->faceArray.back();
+    const uint count = face.indexCount;
+
+    // Validate polygon here.
+    bool invalid = count <= 2;
+
+    if (!invalid) {
+        // Skip zero area polygons. Or polygons with degenerate edges (which will result in zero-area triangles).
+        const uint first = face.firstIndex;
+        for (uint j = count - 1, i = 0; i < count; j = i, i++) {
+            uint v0 = d->indexArray[first + i];
+            uint v1 = d->indexArray[first + j];
+
+            uint p0 = d->vertexArray[v0].pos;
+            uint p1 = d->vertexArray[v1].pos;
+
+            if (p0 == p1) {
+                invalid = true;
+                break;
+            }
+
+            if (equal(d->posArray[p0], d->posArray[p1], FLT_EPSILON)) {
+                invalid = true;
+                break;
+            }
+        }
+        // Fan the polygon around its first vertex and sum the cross product lengths (twice the triangle areas).
+        uint v0 = d->indexArray[first];
+        uint p0 = d->vertexArray[v0].pos;
+        Vector3 x0 = d->posArray[p0];
+
+        float area = 0.0f;
+        for (uint j = 1, i = 2; i < count; j = i, i++) {
+            uint v1 = d->indexArray[first + i];
+            uint v2 = d->indexArray[first + j];
+
+            uint p1 = d->vertexArray[v1].pos;
+            uint p2 = d->vertexArray[v2].pos;
+
+            Vector3 x1 = d->posArray[p1];
+            Vector3 x2 = d->posArray[p2];
+
+            area += length(cross(x1-x0, x2-x0));
+        }
+
+        if (0.5 * area < 1e-6) {    // Reduce this threshold if artists have legitimate complaints.
+            invalid = true;
+        }
+
+        // @@ This is not complete. We may still get zero area triangles after triangulation.
+        // However, our plugin triangulates before building the mesh, so hopefully that's not a problem.
+
+    }
+
+    if (invalid)
+    {
+        d->indexArray.resize(d->indexArray.size() - count);    // drop the indices added for this face
+        d->faceArray.popBack();
+        return false;
+    }
+    else
+    {
+        if (d->currentMaterial != NIL) {
+            d->materialArray[d->currentMaterial].faceCount++;
+        }
+
+        d->maxFaceIndexCount = max(d->maxFaceIndexCount, count);
+        return true;
+    }
+}
+
+
+uint MeshBuilder::weldPositions()
+{
+    Array<uint> xrefs;          // old position index -> welded position index
+    Weld<Vector3> weldVector3;
+
+    const bool hasPositions = d->posArray.count() != 0;
+    if (hasPositions) {
+        // Weld the positions; xrefs receives the remapping applied below.
+        weldVector3(d->posArray, xrefs);
+        // Redirect every vertex that has a position.
+        for (uint v = 0, vertexCount = d->vertexArray.count(); v < vertexCount; v++)
+        {
+            Vertex & vertex = d->vertexArray[v];
+            if (vertex.pos == NIL) continue;
+            vertex.pos = xrefs[vertex.pos];
+        }
+    }
+
+    return d->posArray.count();
+}
+
+uint MeshBuilder::weldNormals()
+{
+    Array<uint> xrefs;          // old normal index -> welded normal index
+    Weld<Vector3> weldVector3;
+
+    const bool hasNormals = d->norArray.count() != 0;
+    if (hasNormals) {
+        // Weld the normals; xrefs receives the remapping applied below.
+        weldVector3(d->norArray, xrefs);
+        // Redirect every vertex that has a normal.
+        for (uint v = 0, vertexCount = d->vertexArray.count(); v < vertexCount; v++)
+        {
+            Vertex & vertex = d->vertexArray[v];
+            if (vertex.nor == NIL) continue;
+            vertex.nor = xrefs[vertex.nor];
+        }
+    }
+
+    return d->norArray.count();
+}
+
+uint MeshBuilder::weldTexCoords(uint set/*=0*/)
+{
+    Array<uint> xrefs;          // old texcoord index -> welded texcoord index
+    Weld<Vector2> weldVector2;
+
+    const bool hasTexCoords = d->texArray[set].count() != 0;
+    if (hasTexCoords) {
+        // Weld the requested set; xrefs receives the remapping applied below.
+        weldVector2(d->texArray[set], xrefs);
+        // Redirect every vertex that has a texcoord in this set.
+        for (uint v = 0, vertexCount = d->vertexArray.count(); v < vertexCount; v++)
+        {
+            Vertex & vertex = d->vertexArray[v];
+            if (vertex.tex[set] == NIL) continue;
+            vertex.tex[set] = xrefs[vertex.tex[set]];
+        }
+    }
+
+    return d->texArray[set].count();
+}
+
+uint  MeshBuilder::weldColors(uint set/*=0*/)
+{
+    Array<uint> xrefs;          // old color index -> welded color index
+    Weld<Vector4> weldVector4;
+
+    const bool hasColors = d->colArray[set].count() != 0;
+    if (hasColors) {
+        // Weld the requested set; xrefs receives the remapping applied below.
+        weldVector4(d->colArray[set], xrefs);
+        // Redirect every vertex that has a color in this set.
+        for (uint v = 0, vertexCount = d->vertexArray.count(); v < vertexCount; v++)
+        {
+            Vertex & vertex = d->vertexArray[v];
+            if (vertex.col[set] == NIL) continue;
+            vertex.col[set] = xrefs[vertex.col[set]];
+        }
+    }
+
+    return d->colArray[set].count();
+}
+
+void MeshBuilder::weldVertices() {
+    // An empty vertex array leaves nothing to merge.
+    const bool noVertices = (d->vertexArray.count() == 0);
+    if (noVertices) {
+        return;
+    }
+
+    // Weld the vertices; xrefs maps each old vertex index to the one that replaces it.
+    Array<uint> xrefs;
+    Weld<Vertex> weldVertex;
+    weldVertex(d->vertexArray, xrefs);
+
+    // Faces reference vertices by index, so rewrite every stored index.
+    for (uint k = 0, indexCount = d->indexArray.count(); k < indexCount; k++)
+    {
+        const uint oldIndex = d->indexArray[k];
+        d->indexArray[k] = xrefs[oldIndex];
+    }
+
+    // The lookup map stores vertex indices as values; remap those as well.
+    foreach(e, d->vertexMap)
+    {
+        const uint oldIndex = d->vertexMap[e].value;
+        d->vertexMap[e].value = xrefs[oldIndex];
+    }
+}
+
+
+void MeshBuilder::optimize()
+{
+    // An empty builder has nothing to weld.
+    if (d->vertexArray.count() == 0) return;
+
+    // First collapse duplicate attribute values, one attribute array at a time
+    // (only texcoord sets 0 and 1 and the default color set are processed).
+    weldPositions();
+    weldNormals();
+    weldTexCoords(0);
+    weldTexCoords(1);
+    weldColors();
+    // Then merge duplicate vertices.
+    weldVertices();
+}
+
+
+
+
+
+
+void MeshBuilder::removeUnusedMaterials(Array<uint> & newMaterialId) // drops materials no face references; newMaterialId[old id] receives the new id, or NIL if removed
+{
+    uint materialCount = d->materialArray.count();
+
+    // Reset face counts.
+    for (uint i = 0; i < materialCount; i++) {
+        d->materialArray[i].faceCount = 0;
+    }
+
+    // Count the faces that reference each material; faces whose material is NIL are skipped.
+    foreach(i, d->faceArray) {
+        Face & face = d->faceArray[i];
+
+        if (face.material != NIL) {
+            nvDebugCheck(face.material < materialCount);
+
+            d->materialArray[face.material].faceCount++;
+        }
+    }
+
+    // Remove unused materials, recording the old id -> new id mapping along the way.
+    newMaterialId.resize(materialCount);
+
+    for (uint i = 0, m = 0; i < materialCount; i++) // i: original material id, m: index into the array while it is being compacted
+    {
+        if (d->materialArray[m].faceCount > 0)
+        {
+            newMaterialId[i] = m++;
+        }
+        else
+        {
+            newMaterialId[i] = NIL;
+            d->materialArray.removeAt(m); // m is not advanced here; presumably removeAt shifts later entries down — confirm
+        }
+    }
+
+    materialCount = d->materialArray.count();
+
+    // Update face material ids using the mapping built above.
+    foreach(i, d->faceArray) {
+        Face & face = d->faceArray[i];
+
+        if (face.material != NIL) {
+            uint id = newMaterialId[face.material];
+            nvDebugCheck(id != NIL && id < materialCount);
+
+            face.material = id;
+        }
+    }
+}
+
+void MeshBuilder::sortFacesByGroup()
+{
+    const uint total = d->faceArray.count();
+
+    // Gather the group id of every face as the sort key.
+    Array<uint> keys;
+    keys.resize(total);
+    for (uint f = 0; f < total; f++) {
+        keys[f] = d->faceArray[f].group;
+    }
+
+    RadixSort sorter;
+    sorter.sort(keys);
+
+    // Copy the faces in ranked order into a scratch array...
+    Array<Face> sorted;
+    sorted.resize(total);
+    for (uint f = 0; f < total; f++) {
+        sorted[f] = d->faceArray[sorter.rank(f)];
+    }
+    // ...and exchange it with the builder's face array.
+    swap(sorted, d->faceArray);
+}
+
+void MeshBuilder::sortFacesByMaterial()
+{
+    const uint total = d->faceArray.count();
+
+    // Gather the material id of every face as the sort key.
+    Array<uint> keys;
+    keys.resize(total);
+    for (uint f = 0; f < total; f++) {
+        keys[f] = d->faceArray[f].material;
+    }
+
+    RadixSort sorter;
+    sorter.sort(keys);
+
+    // Copy the faces in ranked order into a scratch array...
+    Array<Face> sorted;
+    sorted.resize(total);
+    for (uint f = 0; f < total; f++) {
+        sorted[f] = d->faceArray[sorter.rank(f)];
+    }
+    // ...and exchange it with the builder's face array.
+    swap(sorted, d->faceArray);
+}
+
+
+void MeshBuilder::reset() // discards all builder state by replacing the private data with a fresh instance
+{
+    nvDebugCheck(d != NULL);
+    delete d;
+    d = new PrivateData();
+}
+
+void MeshBuilder::done() // ends a group and/or material that is still set as current
+{
+    if (d->currentGroup != NIL) { // a non-NIL current group is ended here
+        endGroup();
+    }
+
+    if (d->currentMaterial != NIL) { // likewise for the current material
+        endMaterial();
+    }
+}
+
+// Hints.
+void MeshBuilder::hintTriangleCount(uint count)
+{
+    d->indexArray.reserve(d->indexArray.count() + count * 4); // reserves 4 indices per hinted triangle on top of what is already stored
+}
+
+void MeshBuilder::hintVertexCount(uint count)
+{
+    d->vertexArray.reserve(d->vertexArray.count() + count); // capacity only; the vertex count is unchanged
+    d->vertexMap.resize(d->vertexMap.count() + count); // NOTE(review): resize(), not reserve() — presumably sizes the map's table; confirm
+}
+
+void MeshBuilder::hintPositionCount(uint count)
+{
+    d->posArray.reserve(d->posArray.count() + count); // reserves room for count more positions
+}
+
+void MeshBuilder::hintNormalCount(uint count)
+{
+    d->norArray.reserve(d->norArray.count() + count); // reserves room for count more normals
+}
+
+void MeshBuilder::hintTexCoordCount(uint count, uint set/*=0*/)
+{
+    d->texArray[set].reserve(d->texArray[set].count() + count); // reserves room for count more texcoords in the given set
+}
+
+void MeshBuilder::hintColorCount(uint count, uint set/*=0*/)
+{
+    d->colArray[set].reserve(d->colArray[set].count() + count); // reserves room for count more colors in the given set
+}
+
+
+// Helpers.
+void MeshBuilder::addTriangle(uint v0, uint v1, uint v2) // adds one three-vertex polygon; v0..v2 are forwarded to addVertex in order
+{
+    beginPolygon();
+    addVertex(v0);
+    addVertex(v1);
+    addVertex(v2);
+    endPolygon(); // the bool result of endPolygon is not checked
+}
+
+void MeshBuilder::addQuad(uint v0, uint v1, uint v2, uint v3) // adds one four-vertex polygon; v0..v3 are forwarded to addVertex in order
+{
+    beginPolygon();
+    addVertex(v0);
+    addVertex(v1);
+    addVertex(v2);
+    addVertex(v3);
+    endPolygon(); // the bool result of endPolygon is not checked
+}
+
+
+// Get tri mesh: every polygon is split into a fan of triangles around its first index. The mesh is allocated with new and returned as a raw pointer.
+TriMesh * MeshBuilder::buildTriMesh() const
+{
+    const uint faceCount = d->faceArray.count();
+    uint triangleCount = 0;
+    for (uint f = 0; f < faceCount; f++) {
+        triangleCount += d->faceArray[f].indexCount - 2; // a polygon with k indices yields k - 2 triangles; assumes k >= 2 — TODO confirm
+    }
+    
+    const uint vertexCount = d->vertexArray.count();
+    TriMesh * mesh = new TriMesh(triangleCount, vertexCount);
+
+    // Build faces.
+    Array<TriMesh::Face> & faces = mesh->faces();
+
+    for(uint f = 0; f < faceCount; f++)
+    {
+        int firstIndex = d->faceArray[f].firstIndex;
+        int indexCount = d->faceArray[f].indexCount;
+
+        int v0 = d->indexArray[firstIndex + 0]; // fan apex, shared by every triangle of this polygon
+        int v1 = d->indexArray[firstIndex + 1];
+
+        for(int t = 0; t < indexCount - 2; t++) {
+            int v2 = d->indexArray[firstIndex + t + 2];
+
+            TriMesh::Face face;
+            face.id = faces.count(); // ids follow insertion order
+            face.v[0] = v0;
+            face.v[1] = v1;
+            face.v[2] = v2;
+            faces.append(face);
+
+            v1 = v2; // the next triangle continues from (v0, v2)
+        }
+    }
+
+    // Build vertices. Attributes a vertex does not reference (NIL) are not assigned; only texcoord set 0 is copied.
+    Array<BaseMesh::Vertex> & vertices = mesh->vertices();
+
+    for(uint i = 0; i < vertexCount; i++)
+    {
+        BaseMesh::Vertex vertex;
+        vertex.id = i;
+        if (d->vertexArray[i].pos != NIL) vertex.pos = d->posArray[d->vertexArray[i].pos];
+        if (d->vertexArray[i].nor != NIL) vertex.nor = d->norArray[d->vertexArray[i].nor];
+        if (d->vertexArray[i].tex[0] != NIL) vertex.tex = d->texArray[0][d->vertexArray[i].tex[0]];
+
+        vertices.append(vertex);
+    }
+
+    return mesh;
+}
+
+// Get quad/tri mesh: only faces with 3 or 4 indices are emitted. The mesh is allocated with new and returned as a raw pointer.
+QuadTriMesh * MeshBuilder::buildQuadTriMesh() const
+{
+    const uint faceCount = d->faceArray.count();
+    const uint vertexCount = d->vertexArray.count();
+    QuadTriMesh * mesh = new QuadTriMesh(faceCount, vertexCount);
+
+    // Build faces.
+    Array<QuadTriMesh::Face> & faces = mesh->faces();
+
+    for (uint f = 0; f < faceCount; f++) 
+    {
+        int firstIndex = d->faceArray[f].firstIndex;
+        int indexCount = d->faceArray[f].indexCount;
+
+        QuadTriMesh::Face face;
+        face.id = f; // id is the builder face index, so ids have gaps when larger polygons are skipped
+
+        face.v[0] = d->indexArray[firstIndex + 0]; // the first three indices are read before indexCount is checked
+        face.v[1] = d->indexArray[firstIndex + 1];
+        face.v[2] = d->indexArray[firstIndex + 2];
+
+        // Only adds triangles and quads. Ignores polygons.
+        if (indexCount == 3) {
+            face.v[3] = NIL; // NIL in the fourth slot marks a triangle
+            faces.append(face);
+        }
+        else if (indexCount == 4) {
+            face.v[3] = d->indexArray[firstIndex + 3];
+            faces.append(face);
+        }
+    }
+
+    // Build vertices. Attributes a vertex does not reference (NIL) are not assigned; only texcoord set 0 is copied.
+    Array<BaseMesh::Vertex> & vertices = mesh->vertices();
+
+    for(uint i = 0; i < vertexCount; i++)
+    {
+        BaseMesh::Vertex vertex;
+        vertex.id = i;
+        if (d->vertexArray[i].pos != NIL) vertex.pos = d->posArray[d->vertexArray[i].pos];
+        if (d->vertexArray[i].nor != NIL) vertex.nor = d->norArray[d->vertexArray[i].nor];
+        if (d->vertexArray[i].tex[0] != NIL) vertex.tex = d->texArray[0][d->vertexArray[i].tex[0]];
+
+        vertices.append(vertex);
+    }
+
+    return mesh;
+}
+
+// Get half edge mesh. weldPositions selects how colocal vertices are linked; error and badFaces (both optional) report faces that could not be added.
+HalfEdge::Mesh * MeshBuilder::buildHalfEdgeMesh(bool weldPositions, Error * error/*=NULL*/, Array<uint> * badFaces/*=NULL*/) const
+{
+    if (error != NULL) *error = Error_None;
+
+    const uint vertexCount = d->vertexArray.count();
+    AutoPtr<HalfEdge::Mesh> mesh(new HalfEdge::Mesh());
+
+    for(uint v = 0; v < vertexCount; v++)
+    {
+        HalfEdge::Vertex * vertex = mesh->addVertex(d->posArray[d->vertexArray[v].pos]); // pos is used without a NIL check, unlike the attributes below
+        if (d->vertexArray[v].nor != NIL) vertex->nor = d->norArray[d->vertexArray[v].nor];
+        if (d->vertexArray[v].tex[0] != NIL) vertex->tex = Vector2(d->texArray[0][d->vertexArray[v].tex[0]]);
+        if (d->vertexArray[v].col[0] != NIL) vertex->col = d->colArray[0][d->vertexArray[v].col[0]];
+    }
+
+    if (weldPositions) {
+        mesh->linkColocals();
+    }
+    else {
+        // Build canonical map from position indices.
+        Array<uint> canonicalMap(vertexCount); // presumably the argument is a capacity, since entries are appended below — confirm
+        
+        foreach (i, d->vertexArray) {
+            canonicalMap.append(d->vertexArray[i].pos);
+        }
+
+        mesh->linkColocalsWithCanonicalMap(canonicalMap);
+    }
+
+    const uint faceCount = d->faceArray.count();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const uint firstIndex = d->faceArray[f].firstIndex;
+        const uint indexCount = d->faceArray[f].indexCount;
+
+        HalfEdge::Face * face = mesh->addFace(d->indexArray, firstIndex, indexCount);
+        
+        // @@ This is too late, removing the face here will leave the mesh improperly connected.
+        /*if (face->area() <= FLT_EPSILON) {
+            mesh->remove(face);
+            face = NULL;
+        }*/
+
+        if (face == NULL) {
+            // Non manifold mesh.
+            if (error != NULL) *error = Error_NonManifoldEdge;
+            if (badFaces != NULL) {
+                badFaces->append(d->faceArray[f].id);
+            }
+            //return NULL; // IC: Ignore error and continue building the mesh.
+        }
+
+        if (face != NULL) {
+            face->group = d->faceArray[f].group;
+            face->material = d->faceArray[f].material;
+        }
+    }
+
+    mesh->linkBoundary();
+
+    // We cannot fix T-junctions here, because this would introduce new vertices and these vertices won't have the corresponding builder data.
+
+    // Maybe the builder should perform the search for T-junctions and update the vertex data directly.
+
+    // For now, we don't fix T-junctions at export time, but only during parameterization.
+
+    //mesh->fixBoundaryJunctions();
+
+    //mesh->sewBoundary();
+
+    return mesh.release(); // the mesh is handed out through release() on the AutoPtr
+}
+
+
+bool MeshBuilder::buildPositions(Array<Vector3> & positionArray) // expands positions to one entry per vertex; always returns true
+{
+    const uint vertexCount = d->vertexArray.count();
+    positionArray.resize(vertexCount);
+
+    for (uint v = 0; v < vertexCount; v++)
+    {
+        nvDebugCheck(d->vertexArray[v].pos != NIL); // unlike normals/texcoords/colors, no fallback value is provided for a missing position
+        positionArray[v] = d->posArray[d->vertexArray[v].pos];
+    }
+
+    return true;
+}
+
+bool MeshBuilder::buildNormals(Array<Vector3> & normalArray)
+{
+    // One output normal per vertex, in vertex order.
+    const uint total = d->vertexArray.count();
+    normalArray.resize(total);
+    bool found = false;
+    for (uint i = 0; i < total; i++)
+    {
+        const uint n = d->vertexArray[i].nor;
+        if (n != NIL) {
+            normalArray[i] = d->norArray[n];
+            found = true;
+        }
+        else {
+            // Vertices without a normal get (0, 0, 1).
+            normalArray[i] = Vector3(0, 0, 1);
+        }
+    }
+    return found;
+}
+
+bool MeshBuilder::buildTexCoords(Array<Vector2> & texCoordArray, uint set/*=0*/)
+{
+    // One output coordinate per vertex, in vertex order.
+    const uint total = d->vertexArray.count();
+    texCoordArray.resize(total);
+    bool found = false;
+    for (uint i = 0; i < total; i++)
+    {
+        const uint t = d->vertexArray[i].tex[set];
+        if (t != NIL) {
+            texCoordArray[i] = d->texArray[set][t];
+            found = true;
+        }
+        else {
+            // Vertices without a coordinate in this set get (0, 0).
+            texCoordArray[i] = Vector2(0, 0);
+        }
+    }
+    return found;
+}
+
+bool MeshBuilder::buildColors(Array<Vector4> & colorArray, uint set/*=0*/)
+{
+    // One output color per vertex, in vertex order.
+    const uint total = d->vertexArray.count();
+    colorArray.resize(total);
+    bool found = false;
+    for (uint i = 0; i < total; i++)
+    {
+        const uint c = d->vertexArray[i].col[set];
+        if (c != NIL) {
+            colorArray[i] = d->colArray[set][c];
+            found = true;
+        }
+        else {
+            // Vertices without a color in this set get (0, 0, 0, 1).
+            colorArray[i] = Vector4(0, 0, 0, 1);
+        }
+    }
+    return found;
+}
+
+void MeshBuilder::buildVertexToPositionMap(Array<int> &map) // map[i] receives the position index of vertex i
+{
+	const uint vertexCount = d->vertexArray.count();
+	map.resize(vertexCount);
+
+	foreach (i, d->vertexArray) {
+		map[i] = d->vertexArray[i].pos; // NOTE(review): the index is stored into an int; a NIL position would presumably become negative — confirm
+	}
+}
+
+
+
+uint MeshBuilder::vertexCount() const // number of entries in the unified vertex array
+{
+    return d->vertexArray.count();
+}
+
+
+uint MeshBuilder::positionCount() const // number of stored positions
+{
+    return d->posArray.count();
+}
+
+uint MeshBuilder::normalCount() const // number of stored normals
+{
+    return d->norArray.count();
+}
+
+uint MeshBuilder::texCoordCount(uint set/*=0*/) const // number of stored texcoords in the given set
+{
+    return d->texArray[set].count();
+}
+
+uint MeshBuilder::colorCount(uint set/*=0*/) const // number of stored colors in the given set
+{
+    return d->colArray[set].count();
+}
+
+
+uint MeshBuilder::materialCount() const // number of entries in the material array
+{
+    return d->materialArray.count();
+}
+
+const char * MeshBuilder::material(uint i) const // name of material i; i is not range-checked here
+{
+    return d->materialArray[i].name;
+}
+
+
+uint MeshBuilder::positionIndex(uint vertex) const // index into the position array used by the given vertex
+{
+    return d->vertexArray[vertex].pos;
+}
+uint MeshBuilder::normalIndex(uint vertex) const // index into the normal array used by the given vertex (may be NIL)
+{
+    return d->vertexArray[vertex].nor;
+}
+uint MeshBuilder::texCoordIndex(uint vertex, uint set/*=0*/) const // index into texcoord set `set` used by the given vertex (may be NIL)
+{
+    return d->vertexArray[vertex].tex[set];
+}
+uint MeshBuilder::colorIndex(uint vertex, uint set/*=0*/) const // index into color set `set` used by the given vertex (may be NIL)
+{
+    return d->vertexArray[vertex].col[set];
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/MeshBuilder.h b/thirdparty/thekla_atlas/nvmesh/MeshBuilder.h
new file mode 100644
index 0000000000..5b3af3fc1d
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/MeshBuilder.h
@@ -0,0 +1,119 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_MESHBUILDER_H
+#define NV_MESH_MESHBUILDER_H
+
+#include "nvmesh.h"
+#include "nvcore/Array.h"
+#include "nvmath/Vector.h"
+
+namespace nv
+{
+    class String;
+    class TriMesh;
+    class QuadTriMesh;
+    namespace HalfEdge { class Mesh; }
+
+
+    /// Mesh builder is a helper class for importers.
+    /// Ideally it should handle any vertex data, but for now it only accepts positions,
+    /// normals, texcoords and colors.
+    class MeshBuilder
+    {
+        NV_FORBID_COPY(MeshBuilder);
+        NV_FORBID_HEAPALLOC();
+    public:
+        MeshBuilder();
+        ~MeshBuilder();
+
+        // Builder methods.
+        uint addPosition(const Vector3 & v);
+        uint addNormal(const Vector3 & v);
+        uint addTexCoord(const Vector2 & v, uint set = 0);
+        uint addColor(const Vector4 & v, uint set = 0);
+
+        void beginGroup(uint id);
+        void endGroup();
+
+        uint addMaterial(const char * name);
+        void beginMaterial(uint id);
+        void endMaterial();
+
+        void beginPolygon(uint id = 0);
+        uint addVertex(uint p, uint n = NIL, uint t0 = NIL, uint t1 = NIL, uint c0 = NIL, uint c1 = NIL, uint c2 = NIL);
+        uint addVertex(const Vector3 & p);
+        //uint addVertex(const Vector3 & p, const Vector3 & n, const Vector2 & t0 = Vector2(0), const Vector2 & t1 = Vector2(0), const Vector4 & c0 = Vector4(0), const Vector4 & c1 = Vector4(0));
+        bool endPolygon();
+
+        uint weldPositions();
+        uint weldNormals(); // returns the normal count after welding
+        uint weldTexCoords(uint set = 0); // returns the texcoord count of the set after welding
+        uint weldColors(uint set = 0); // returns the color count of the set after welding
+        void weldVertices(); // merges duplicate vertices and remaps the index array and vertex map
+
+        void optimize(); // eliminate duplicate components and duplicate vertices.
+        void removeUnusedMaterials(Array<uint> & newMaterialId); // newMaterialId[old id] = new id, or NIL if the material was removed
+        void sortFacesByGroup();
+        void sortFacesByMaterial();
+
+        void done(); // ends the current group and material if any is set
+        void reset(); // replaces the private data with a fresh instance
+
+        // Hints. These only reserve storage ahead of time.
+        void hintTriangleCount(uint count);
+        void hintVertexCount(uint count);
+        void hintPositionCount(uint count);
+        void hintNormalCount(uint count);
+        void hintTexCoordCount(uint count, uint set = 0);
+        void hintColorCount(uint count, uint set = 0);
+
+        // Helpers. Each adds one polygon through beginPolygon/addVertex/endPolygon.
+        void addTriangle(uint v0, uint v1, uint v2);
+        void addQuad(uint v0, uint v1, uint v2, uint v3);
+
+        // Get result. Both return a mesh allocated with new.
+        TriMesh * buildTriMesh() const;
+        QuadTriMesh * buildQuadTriMesh() const;
+
+        enum Error {
+            Error_None,
+            Error_NonManifoldEdge,
+            Error_NonManifoldVertex, // not assigned by buildHalfEdgeMesh in MeshBuilder.cpp, which only reports the two values above
+        };
+
+        HalfEdge::Mesh * buildHalfEdgeMesh(bool weldPositions, Error * error = NULL, Array<uint> * badFaces = NULL) const;
+
+        bool buildPositions(Array<Vector3> & positionArray); // always returns true
+        bool buildNormals(Array<Vector3> & normalArray); // returns whether any vertex has a normal
+        bool buildTexCoords(Array<Vector2> & texCoordArray, uint set = 0); // returns whether any vertex has a texcoord in the set
+        bool buildColors(Array<Vector4> & colorArray, uint set = 0); // returns whether any vertex has a color in the set
+		void buildVertexToPositionMap(Array<int> & map);
+
+
+        // Expose attribute indices of the unified vertex array.
+        uint vertexCount() const;
+        
+        uint positionCount() const;
+        uint normalCount() const;
+        uint texCoordCount(uint set = 0) const;
+        uint colorCount(uint set = 0) const;
+
+        uint materialCount() const;
+        const char * material(uint i) const; // name of material i
+
+        uint positionIndex(uint vertex) const;
+        uint normalIndex(uint vertex) const;
+        uint texCoordIndex(uint vertex, uint set = 0) const;
+        uint colorIndex(uint vertex, uint set = 0) const;
+
+    private:
+
+        struct PrivateData;
+        PrivateData * d; // deleted and re-created by reset()
+
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_MESHBUILDER_H
diff --git a/thirdparty/thekla_atlas/nvmesh/MeshTopology.cpp b/thirdparty/thekla_atlas/nvmesh/MeshTopology.cpp
new file mode 100644
index 0000000000..e7e1dce421
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/MeshTopology.cpp
@@ -0,0 +1,122 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmesh.h" // pch
+
+#include "nvcore/Array.h"
+#include "nvcore/BitArray.h"
+
+#include "nvmesh/MeshTopology.h"
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Edge.h"
+#include "nvmesh/halfedge/Face.h"
+
+using namespace nv;
+
+void MeshTopology::buildTopologyInfo(const HalfEdge::Mesh * mesh) // fills m_connectedCount, m_boundaryCount, m_eulerNumber and m_genus from the mesh
+{
+    const uint vertexCount = mesh->colocalVertexCount();
+    const uint faceCount = mesh->faceCount();
+    const uint edgeCount = mesh->edgeCount();
+
+    nvDebug( "--- Building mesh topology:\n" );
+
+    Array<uint> stack(faceCount);
+
+    BitArray bitFlags(faceCount);
+    bitFlags.clearAll();
+
+    // Compute connectivity: flood-fill over faces that share an edge; every fill started from an unvisited face is one component.
+    nvDebug( "---   Computing connectivity.\n" );
+
+    m_connectedCount = 0;
+
+    for(uint f = 0; f < faceCount; f++ ) {
+        if( bitFlags.bitAt(f) == false ) {
+            m_connectedCount++;
+
+            stack.pushBack( f );
+            while( !stack.isEmpty() ) {
+
+                const uint top = stack.back();
+                nvCheck(top != NIL);
+                stack.popBack();
+
+                if( bitFlags.bitAt(top) == false ) { // a face can be pushed more than once; only its first visit is processed
+                    bitFlags.setBitAt(top);
+
+                    const HalfEdge::Face * face = mesh->faceAt(top);
+                    const HalfEdge::Edge * firstEdge = face->edge;
+                    const HalfEdge::Edge * edge = firstEdge;
+
+                    do {
+                        const HalfEdge::Face * neighborFace = edge->pair->face; // face on the other side of this edge; NULL means there is none
+                        if (neighborFace != NULL) {
+                            stack.pushBack(neighborFace->id);
+                        }
+                        edge = edge->next;
+                    } while(edge != firstEdge);
+                }
+            }
+        }
+    }
+    nvCheck(stack.isEmpty());
+    nvDebug( "---   %d connected components.\n", m_connectedCount );
+
+
+    // Count boundary loops. The bit array is reused with one bit per edge.
+    nvDebug( "---   Counting boundary loops.\n" );
+    m_boundaryCount = 0;
+
+    bitFlags.resize(edgeCount);
+    bitFlags.clearAll();
+
+    // Don't forget to link the boundary otherwise this won't work.
+    for (uint e = 0; e < edgeCount; e++)
+    {
+        const HalfEdge::Edge * startEdge = mesh->edgeAt(e);
+        if (startEdge != NULL && startEdge->isBoundary() && bitFlags.bitAt(e) == false)
+        {
+            nvDebugCheck(startEdge->face != NULL);
+            nvDebugCheck(startEdge->pair->face == NULL);
+
+            startEdge = startEdge->pair; // continue from the paired half-edge, the one checked above to have no face
+
+            m_boundaryCount++;
+
+            const HalfEdge::Edge * edge = startEdge;
+            do {
+                bitFlags.setBitAt(edge->id / 2); // presumably id / 2 is the edge index shared by both half-edges — confirm
+                edge = edge->next;
+            } while(startEdge != edge);
+        }
+    }
+    nvDebug("---   %d boundary loops found.\n", m_boundaryCount );
+
+
+    // Compute euler number (V - E + F, using the colocal vertex count).
+    m_eulerNumber = vertexCount - edgeCount + faceCount;
+    nvDebug("---   Euler number: %d.\n", m_eulerNumber);
+
+
+    // Compute genus. (only valid on closed connected surfaces; stays -1 otherwise)
+    m_genus = -1;
+    if( isClosed() && isConnected() ) {
+        m_genus = (2 - m_eulerNumber) / 2;
+        nvDebug("---   Genus: %d.\n", m_genus);
+    }
+}
+
+
+/*static*/ bool MeshTopology::isQuadOnly(const HalfEdge::Mesh * mesh)
+{
+    // Scan the faces; the first one that is not four-sided settles the answer.
+    bool quadOnly = true;
+    const uint total = mesh->faceCount();
+    for (uint i = 0; quadOnly && i < total; i++)
+    {
+        quadOnly = (mesh->faceAt(i)->edgeCount() == 4);
+    }
+
+    // A mesh without faces also counts as quad-only.
+    return quadOnly;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/MeshTopology.h b/thirdparty/thekla_atlas/nvmesh/MeshTopology.h
new file mode 100644
index 0000000000..c3d7477b15
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/MeshTopology.h
@@ -0,0 +1,66 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_MESHTOPOLOGY_H
+#define NV_MESH_MESHTOPOLOGY_H
+
+#include <nvmesh/nvmesh.h>
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+    class MeshAdjacency;
+
+    /// Mesh topology information, computed once in the constructor.
+    class MeshTopology
+    {
+    public:
+        MeshTopology(const HalfEdge::Mesh * mesh) { buildTopologyInfo(mesh); }
+
+        /// Determine if the mesh is connected.
+        bool isConnected() const { return m_connectedCount == 1; }
+
+        /// Determine if the mesh is closed. (Each edge is shared by two faces)
+        bool isClosed() const { return m_boundaryCount == 0; }
+
+        /// Return true if the mesh has the topology of a disk.
+        bool isDisk() const { return isConnected() && m_boundaryCount == 1/* && m_eulerNumber == 1*/; }
+
+        /// Return the number of connected components.
+        int connectedCount() const { return m_connectedCount; }
+
+        /// Return the number of open holes (boundary loops).
+        int holeCount() const { return m_boundaryCount; }
+
+        /// Return the genus of the mesh (-1 unless the mesh is closed and connected).
+        int genus() const { return m_genus; }
+
+        /// Return the euler number of the mesh.
+        int euler() const { return m_eulerNumber; }
+
+
+        static bool isQuadOnly(const HalfEdge::Mesh * mesh);
+
+
+    private:
+
+        NVMESH_API void buildTopologyInfo(const HalfEdge::Mesh * mesh);
+
+    private:
+
+        /// Number of boundary loops.
+        int m_boundaryCount;		
+
+        /// Number of connected components.
+        int m_connectedCount;		
+
+        /// Euler number.
+        int m_eulerNumber;
+
+        /// Mesh genus.
+        int m_genus;
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_MESHTOPOLOGY_H
diff --git a/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.cpp b/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.cpp
new file mode 100644
index 0000000000..64a071abe9
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.cpp
@@ -0,0 +1,36 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "QuadTriMesh.h"
+#include "Stream.h"
+
+using namespace nv;
+
+
+bool QuadTriMesh::isQuadFace(uint i) const // forwards to Face::isQuadFace() of face i
+{ 
+    return m_faceArray[i].isQuadFace();
+}
+
+const QuadTriMesh::Vertex & QuadTriMesh::faceVertex(uint f, uint v) const // vertex referenced by corner v of face f
+{
+    if (isQuadFace(f)) nvDebugCheck(v < 4); // quads have corners 0..3
+    else nvDebugCheck(v < 3); // triangles have corners 0..2
+
+    const Face & face = this->faceAt(f);
+    return this->vertexAt(face.v[v]);
+}
+
+
+namespace nv
+{
+    static Stream & operator<< (Stream & s, QuadTriMesh::Face & face) // file-local: streams the id followed by the four vertex indices
+    {
+        return s << face.id << face.v[0] << face.v[1] << face.v[2] << face.v[3];
+    }
+
+    Stream & operator<< (Stream & s, QuadTriMesh & mesh) // streams the face array first, then the BaseMesh part
+    {
+        return s << mesh.m_faceArray << (BaseMesh &) mesh;
+    }
+}
+
diff --git a/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.h b/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.h
new file mode 100644
index 0000000000..b8465f2db0
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/QuadTriMesh.h
@@ -0,0 +1,60 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MESH_QUADTRIMESH_H
+#define NV_MESH_QUADTRIMESH_H
+
+#include "nvcore/Array.h"
+#include "nvmath/Vector.h"
+#include "nvmesh/nvmesh.h"
+#include "nvmesh/BaseMesh.h"
+
+namespace nv
+{
+    class Stream;
+
+    /// Mixed quad/triangle mesh: a BaseMesh plus an array of faces with up to four vertex indices.
+    class QuadTriMesh : public BaseMesh
+    {
+    public:
+        struct Face;
+        typedef BaseMesh::Vertex Vertex;
+
+        QuadTriMesh() {};
+        QuadTriMesh(uint faceCount, uint vertexCount) : BaseMesh(vertexCount), m_faceArray(faceCount) {}
+
+        // Face methods.
+        uint faceCount() const { return m_faceArray.count(); }
+
+        const Face & faceAt(uint i) const { return m_faceArray[i]; }
+        Face & faceAt(uint i) { return m_faceArray[i]; }
+
+        const Array<Face> & faces() const { return m_faceArray; }
+        Array<Face> & faces() { return m_faceArray; }
+
+        bool isQuadFace(uint i) const; // true when face i uses its fourth vertex slot
+
+        const Vertex & faceVertex(uint f, uint v) const; // vertex referenced by corner v of face f
+
+        friend Stream & operator<< (Stream & s, QuadTriMesh & obj); // needs access to m_faceArray
+
+    private:
+
+        Array<Face> m_faceArray;
+
+    };
+
+
+    /// QuadTriMesh face: an id and up to four vertex indices.
+    struct QuadTriMesh::Face
+    {
+        uint id;
+        uint v[4]; // v[3] is NIL for a triangle
+
+        bool isQuadFace() const { return v[3] != NIL; }
+    };
+
+} // nv namespace
+
+
+#endif // NV_MESH_QUADTRIMESH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/TriMesh.cpp b/thirdparty/thekla_atlas/nvmesh/TriMesh.cpp
new file mode 100644
index 0000000000..bf10a474fb
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/TriMesh.cpp
@@ -0,0 +1,25 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "TriMesh.h"
+
+using namespace nv;
+
+
+/// Get the normal of face f, computed from the cross product of the edges (p1 - p0) and (p2 - p0).
+Vector3 TriMesh::faceNormal(uint f) const
+{
+    const Face & face = this->faceAt(f);
+    const Vector3 & p0 = this->vertexAt(face.v[0]).pos;
+    const Vector3 & p1 = this->vertexAt(face.v[1]).pos;
+    const Vector3 & p2 = this->vertexAt(face.v[2]).pos;
+    return normalizeSafe(cross(p1 - p0, p2 - p0), Vector3(0.0f), 0.0f); // NOTE(review): Vector3(0.0f) is presumably the fallback for degenerate faces — confirm against normalizeSafe
+}
+
+/// Get the vertex referenced by corner v (0..2) of face f.
+const TriMesh::Vertex & TriMesh::faceVertex(uint f, uint v) const
+{
+    nvDebugCheck(v < 3);
+    const Face & face = this->faceAt(f);
+    return this->vertexAt(face.v[v]);
+}
+
diff --git a/thirdparty/thekla_atlas/nvmesh/TriMesh.h b/thirdparty/thekla_atlas/nvmesh/TriMesh.h
new file mode 100644
index 0000000000..bc5672c1ac
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/TriMesh.h
@@ -0,0 +1,51 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MESH_TRIMESH_H
+#define NV_MESH_TRIMESH_H
+
+#include "nvcore/Array.h"
+#include "nvmath/Vector.inl"
+#include "nvmesh/nvmesh.h"
+#include "nvmesh/BaseMesh.h"
+
+namespace nv
+{
+    /// Triangle mesh: a BaseMesh plus an array of three-index faces.
+    class TriMesh : public BaseMesh
+    {
+    public:
+        struct Face;
+        typedef BaseMesh::Vertex Vertex;
+
+        TriMesh(uint faceCount, uint vertexCount) : BaseMesh(vertexCount), m_faceArray(faceCount) {}
+
+        // Face methods.
+        uint faceCount() const { return m_faceArray.count(); }
+        const Face & faceAt(uint i) const { return m_faceArray[i]; }
+        Face & faceAt(uint i) { return m_faceArray[i]; }
+        const Array<Face> & faces() const { return m_faceArray; }
+        Array<Face> & faces() { return m_faceArray; }
+
+        NVMESH_API Vector3 faceNormal(uint f) const;
+        NVMESH_API const Vertex & faceVertex(uint f, uint v) const;
+
+        friend Stream & operator<< (Stream & s, BaseMesh & obj); // NOTE(review): the friend takes BaseMesh &, not TriMesh & — confirm this is intended
+
+    private:
+
+        Array<Face> m_faceArray;
+
+    };
+
+
+    /// TriMesh face: an id and three vertex indices.
+    struct TriMesh::Face
+    {
+        uint id;
+        uint v[3];
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_TRIMESH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.cpp b/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.cpp
new file mode 100644
index 0000000000..69fd1deb24
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.cpp
@@ -0,0 +1,54 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "nvmesh.h" // pch
+
+#include "Bounds.h"
+
+#include "nvmesh/BaseMesh.h"
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Vertex.h"
+
+#include "nvmath/Box.inl"
+
+using namespace nv;
+
+Box MeshBounds::box(const BaseMesh * mesh)
+{
+    nvCheck(mesh != NULL);
+
+    // Start from cleared bounds and grow them with every vertex position.
+    Box result;
+    result.clearBounds();
+
+    const uint total = mesh->vertexCount();
+    for (uint i = 0; i < total; i++)
+    {
+        result.addPointToBounds(mesh->vertexAt(i).pos);
+    }
+
+    return result;
+}
+
+Box MeshBounds::box(const HalfEdge::Mesh * mesh)
+{
+    nvCheck(mesh != NULL);
+
+    // Start from cleared bounds and grow them with every vertex position.
+    Box result;
+    result.clearBounds();
+    const uint total = mesh->vertexCount();
+    for (uint i = 0; i < total; i++)
+    {
+        const HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+        nvDebugCheck(vertex != NULL);
+        result.addPointToBounds(vertex->pos);
+    }
+
+    return result;
+}
+
+/*Sphere MeshBounds::sphere(const HalfEdge::Mesh * mesh)
+{
+    // @@ TODO
+    return Sphere();
+}*/
diff --git a/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.h b/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.h
new file mode 100644
index 0000000000..1cb5b7b905
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/geometry/Bounds.h
@@ -0,0 +1,28 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MESH_MESHBOUNDS_H
+#define NV_MESH_MESHBOUNDS_H
+
+#include <nvmath/Sphere.h>
+#include <nvmath/Box.h>
+
+#include <nvmesh/nvmesh.h>
+
+namespace nv
+{
+    class BaseMesh;
+    namespace HalfEdge { class Mesh; }
+
+    // Bounding volumes computation.
+    namespace MeshBounds
+    {
+        Box box(const BaseMesh * mesh); // box grown over all vertex positions of the mesh
+        Box box(const HalfEdge::Mesh * mesh); // same, for a half-edge mesh
+
+        Sphere sphere(const HalfEdge::Mesh * mesh); // NOTE(review): the only definition in Bounds.cpp is commented out (marked TODO) — calling this would presumably fail to link
+    }
+
+} // nv namespace
+
+#endif // NV_MESH_MESHBOUNDS_H
diff --git a/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.cpp b/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.cpp
new file mode 100644
index 0000000000..e0c271663b
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.cpp
@@ -0,0 +1,36 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "Measurements.h"
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+
+using namespace nv;
+
+float nv::computeSurfaceArea(const HalfEdge::Mesh * mesh)
+{
+    // Accumulate the area of every face of the mesh.
+    float area = 0;
+    HalfEdge::Mesh::ConstFaceIterator cursor(mesh->faces());
+    while (!cursor.isDone())
+    {
+        area += cursor.current()->area();
+        cursor.advance();
+    }
+    nvDebugCheck(area >= 0);
+    return area;
+}
+
+float nv::computeParametricArea(const HalfEdge::Mesh * mesh)
+{
+    // Accumulate the parametric area of every face of the mesh.
+    float total = 0;
+    HalfEdge::Mesh::ConstFaceIterator cursor(mesh->faces());
+    while (!cursor.isDone())
+    {
+        total += cursor.current()->parametricArea();
+        cursor.advance();
+    }
+    return total;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.h b/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.h
new file mode 100644
index 0000000000..0be863b79e
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/geometry/Measurements.h
@@ -0,0 +1,18 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MESH_MESHMEASUREMENTS_H
+#define NV_MESH_MESHMEASUREMENTS_H
+
+#include "nvmesh/nvmesh.h"
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+    // Mesh-wide totals accumulated over the faces: Face::area() and Face::parametricArea() respectively.
+	float computeSurfaceArea(const HalfEdge::Mesh * mesh);
+	float computeParametricArea(const HalfEdge::Mesh * mesh);
+
+} // nv namespace
+
+#endif // NV_MESH_MESHMEASUREMENTS_H
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.cpp b/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.cpp
new file mode 100644
index 0000000000..671650296c
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.cpp
@@ -0,0 +1,57 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmesh.h" // pch
+
+#include "Edge.h"
+#include "Vertex.h"
+
+#include "nvmath/Vector.inl"
+
+using namespace nv;
+using namespace HalfEdge;
+
+Vector3 Edge::midPoint() const {
+    const Vector3 endpointSum = to()->pos + from()->pos; // midpoint = half the sum of both endpoint positions
+    return endpointSum * 0.5f;
+}
+
+float Edge::length() const {
+    const Vector3 span = to()->pos - from()->pos; // vector from the origin vertex to the destination vertex
+    return ::length(span);
+}
+
+// Return the angle at this edge's origin vertex, between the previous edge and this one.
+// The cosine is clamped to [-1, 1] so that rounding error cannot make acosf() return NaN.
+float Edge::angle() const {
+    const Vector3 p = vertex->pos;
+    const Vector3 v0 = prev->vertex->pos - p;
+    const Vector3 v1 = next->vertex->pos - p;
+
+    float c = dot(v0, v1) / (nv::length(v0) * nv::length(v1));
+    if (c > 1.0f) c = 1.0f; else if (c < -1.0f) c = -1.0f;
+    return acosf(c);
+}
+
+bool Edge::isValid() const
+{
+    // Every link except 'face' must be set; a null face is OK.
+    const bool linked = (next != NULL && prev != NULL && pair != NULL && vertex != NULL);
+    if (!linked) return false;
+
+    // Each neighbour must point back at this edge.
+    return next->prev == this && prev->next == this && pair->pair == this;
+}
+
+/*
+Edge * Edge::nextBoundary() {
+    nvDebugCheck(this->m_pair == NULL);
+
+}
+
+Edge * Edge::prevBoundary() {
+    nvDebugCheck(this->m_pair == NULL);
+
+}
+*/
+
+
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.h b/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.h
new file mode 100644
index 0000000000..25c47f4860
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Edge.h
@@ -0,0 +1,70 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_HALFEDGE_EDGE_H
+#define NV_MESH_HALFEDGE_EDGE_H
+
+#include "nvmath/Vector.h"
+
+namespace nv
+{
+    namespace HalfEdge { class Vertex; class Face; class Edge; }
+
+    /// Half edge. from() is its own 'vertex'; to() is the 'vertex' of its 'pair'.
+    class HalfEdge::Edge
+    {
+        NV_FORBID_COPY(Edge);
+    public:
+
+        uint id;
+
+        Edge * next;
+        Edge * prev;	// This is not strictly half-edge, but makes algorithms easier and faster.
+        Edge * pair;
+        Vertex * vertex;
+        Face * face;
+
+
+        // Construct an edge with the given id; every link starts out NULL.
+        Edge(uint id) : id(id), next(NULL), prev(NULL), pair(NULL), vertex(NULL), face(NULL)
+        {
+        }
+
+
+        // Vertex queries. Note that to() dereferences 'pair', so it requires a non-NULL pair.
+        const Vertex * from() const { return vertex; }
+        Vertex * from() { return vertex; }
+
+        const Vertex * to() const { return pair->vertex; }  // This used to be 'next->vertex', but that changed often when the connectivity of the mesh changes.
+        Vertex * to() { return pair->vertex; }
+
+
+        // Link setters: each one also updates the reciprocal pointer of 'e' when 'e' is not NULL.
+        void setNext(Edge * e) { next = e; if (e != NULL) e->prev = this; }
+        void setPrev(Edge * e) { prev = e; if (e != NULL) e->next = this; }
+
+        // @@ Add these helpers:
+        //Edge * nextBoundary();
+        //Edge * prevBoundary();
+
+
+        // @@ It would be more simple to only check m_pair == NULL
+        // Face queries. Boundary means this edge or its pair has no face; 'pair' is dereferenced when 'face' is set.
+        bool isBoundary() const { return !(face && pair->face); }
+
+        // @@ This is not exactly accurate, we should compare the texture coordinates...
+        bool isSeam() const { return vertex != pair->next->vertex || next->vertex != pair->vertex; }
+
+        bool isValid() const;
+
+        // Geometric queries.
+        Vector3 midPoint() const;
+        float length() const;
+        float angle() const;
+
+    };
+
+} // nv namespace
+
+
+#endif // NV_MESH_HALFEDGE_EDGE_H
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Face.cpp b/thirdparty/thekla_atlas/nvmesh/halfedge/Face.cpp
new file mode 100644
index 0000000000..9f6987154e
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Face.cpp
@@ -0,0 +1,268 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmesh.h" // pch
+
+#include "Face.h"
+#include "Vertex.h"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Plane.h"
+#include "nvmath/Vector.inl"
+
+#include "nvcore/Array.h"
+
+
+using namespace nv;
+using namespace HalfEdge;
+
+/// Get face area, computed as a triangle fan anchored at the first edge's origin vertex.
+float Face::area() const
+{
+    const Vector3 & anchor = edge->from()->pos;
+    const Edge * const stop = edge->prev;
+    float doubledArea = 0;
+
+    // Walk from the second edge up to (not including) the last edge of the loop.
+    ConstEdgeIterator fanIt(edges(edge->next));
+    while (fanIt.current() != stop)
+    {
+        const Edge * e = fanIt.current();
+        // |cross| is twice the area of the triangle (anchor, e->vertex, e->next->vertex).
+        doubledArea += length(cross(e->vertex->pos - anchor, e->next->vertex->pos - anchor));
+        fanIt.advance();
+    }
+    return doubledArea * 0.5f;
+}
+
+// Area of the face measured on the vertices' texture coordinates ('tex'), using a triangle fan.
+float Face::parametricArea() const
+{
+    const Vector2 & anchor = edge->from()->tex;
+    const Edge * const stop = edge->prev;
+    float accum = 0;
+
+    ConstEdgeIterator fanIt(edges(edge->next));
+    while (fanIt.current() != stop)
+    {
+        const Edge * e = fanIt.current();
+        accum += triangleArea(anchor, e->vertex->tex, e->next->vertex->tex);
+        fanIt.advance();
+    }
+    // The accumulated triangleArea() results are halved here.
+    return accum * 0.5f;
+}
+
+
+/// Get boundary length: the sum of Edge::length() over all edges of this face.
+float Face::boundaryLength() const
+{
+    float perimeter = 0;
+
+    ConstEdgeIterator it(edges());
+    while (!it.isDone())
+    {
+        perimeter += it.current()->length();
+        it.advance();
+    }
+    return perimeter;
+}
+
+
+/// Get face normal: the normalized sum of the cross products of a triangle fan around the face's first vertex.
+Vector3 Face::normal() const
+{
+    Vector3 n(0);
+
+    const Vertex * vertex0 = NULL;
+
+    for (ConstEdgeIterator it(edges()); !it.isDone(); it.advance())
+    {
+        const Edge * edge = it.current();
+        nvCheck(edge != NULL);
+
+        if (vertex0 == NULL)
+        {
+            vertex0 = edge->vertex;
+        }
+        else if (edge->next->vertex != vertex0)
+        {
+            const HalfEdge::Vertex * vertex1 = edge->from();
+            const HalfEdge::Vertex * vertex2 = edge->to();
+
+            const Vector3 & p0 = vertex0->pos;
+            const Vector3 & p1 = vertex1->pos;
+            const Vector3 & p2 = vertex2->pos;
+
+            Vector3 v10 = p1 - p0;
+            Vector3 v20 = p2 - p0;
+            // Accumulate the unnormalized normal of the triangle (vertex0, vertex1, vertex2).
+            n += cross(v10, v20);
+        }
+    }
+    // (0, 0, 1) is the fallback vector handed to normalizeSafe().
+    return normalizeSafe(n, Vector3(0, 0, 1), 0.0f);
+
+
+    // Disabled alternative implementation: get face points eliminating duplicates.
+    /*Array<Vector3> points(4);
+
+    points.append(m_edge->prev()->from()->pos);
+
+    for (ConstEdgeIterator it(edges()); !it.isDone(); it.advance())
+    {
+        const Edge * edge = it.current();
+        nvDebugCheck(edge != NULL);
+
+        const Vector3 & p = edge->from()->pos;
+        if (points.back() != p)
+        {
+            points.append(edge->from()->pos);
+        }
+    }
+
+    points.popBack();
+
+    if (points.count() < 3)
+    {
+        // Invalid normal.
+        return Vector3(0.0f);
+    }
+    else
+    {
+        // Compute regular normal.
+        Vector3 normal = normalizeSafe(cross(points[1] - points[0], points[2] - points[0]), Vector3(0.0f), 0.0f);
+
+#pragma NV_MESSAGE("TODO: make sure these three points are not colinear")
+
+        if (points.count() > 3)
+        {
+            // Compute best fitting plane to the points.
+            Plane plane = Fit::bestPlane(points.count(), points.buffer());
+
+            // Adjust normal orientation.
+            if (dot(normal, plane.vector()) > 0) {
+                normal = plane.vector();
+            }
+            else {
+                normal = -plane.vector();
+            }
+        }
+
+        nvDebugCheck(isNormalized(normal));
+        return normal;
+    }*/
+}
+
+// Average of the origin-vertex positions of all edges in the face's loop.
+Vector3 Face::centroid() const
+{
+    Vector3 accum(0.0f);
+    uint vertexTotal = 0;
+    ConstEdgeIterator it(edges());
+    while (!it.isDone())
+    {
+        accum += it.current()->from()->pos;
+        ++vertexTotal;
+        it.advance();
+    }
+    return accum / float(vertexTotal);
+}
+
+
+// A face is valid when every edge of its loop points back to it, every edge and its
+// pair pass Edge::isValid(), and the loop has at least three edges.
+bool Face::isValid() const
+{
+    uint edgeTotal = 0;
+
+    ConstEdgeIterator it(edges());
+    while (!it.isDone())
+    {
+        const Edge * e = it.current();
+        const bool ok = (e->face == this) && e->isValid() && e->pair->isValid();
+        if (!ok) return false;
+        ++edgeTotal;
+        it.advance();
+    }
+    return edgeTotal >= 3;
+}
+
+
+// Determine if the given edge is part of this face's edge loop.
+bool Face::contains(const Edge * e) const {
+    ConstEdgeIterator it(edges());
+    while (!it.isDone()) {
+        if (it.current() == e) return true;
+        it.advance();
+    }
+    return false;
+}
+
+// Returns the position of the given edge within this face's loop, or NIL if it is not part of it.
+uint Face::edgeIndex(const Edge * e) const {
+    int position = 0;
+    for (ConstEdgeIterator it(edges()); !it.isDone(); it.advance())
+    {
+        if (it.current() == e) return position;
+        ++position;
+    }
+    return NIL;
+}
+
+
+// Returns the idx-th edge of this face's loop, or NULL when the loop has idx edges or fewer.
+Edge * Face::edgeAt(uint idx) {
+    uint remaining = idx;
+    for (EdgeIterator it(edges()); !it.isDone(); it.advance()) {
+        if (remaining-- == 0) return it.current();
+    }
+    return NULL;
+}
+// Read-only overload: the idx-th edge of the loop, or NULL when the loop has idx edges or fewer.
+const Edge * Face::edgeAt(uint idx) const {
+    uint remaining = idx;
+    for (ConstEdgeIterator it(edges()); !it.isDone(); it.advance()) {
+        if (remaining-- == 0) return it.current();
+    }
+    return NULL;
+}
+
+
+// Count the number of edges in this face by walking its edge loop once.
+uint Face::edgeCount() const {
+    uint total = 0;
+    ConstEdgeIterator it(edges());
+    while (!it.isDone()) { it.advance(); total++; }
+    return total;
+}
+
+// Determine if this is a boundary face, i.e. at least one of its edges has a pair without a face.
+bool Face::isBoundary() const
+{
+    ConstEdgeIterator it(edges());
+    while (!it.isDone())
+    {
+        const Edge * edge = it.current();
+        nvDebugCheck(edge->pair != NULL);
+
+        if (edge->pair->face == NULL) return true;
+        it.advance();
+    }
+    return false;
+}
+
+// Count the edges of this face whose pair has no face (boundary edges).
+uint Face::boundaryCount() const
+{
+    uint total = 0;
+
+    ConstEdgeIterator it(edges());
+    while (!it.isDone())
+    {
+        const Edge * edge = it.current();
+        nvDebugCheck(edge->pair != NULL);
+        total += (edge->pair->face == NULL) ? 1 : 0;
+        it.advance();
+    }
+    return total;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Face.h b/thirdparty/thekla_atlas/nvmesh/halfedge/Face.h
new file mode 100644
index 0000000000..677f8666f0
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Face.h
@@ -0,0 +1,106 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_HALFEDGE_FACE_H
+#define NV_MESH_HALFEDGE_FACE_H
+
+#include <nvmesh/halfedge/Edge.h>
+
+namespace nv
+{
+    namespace HalfEdge { class Vertex; class Face; class Edge; }
+
+    /// Face of a half-edge mesh. Only one edge of the loop is stored; the others are reached through Edge::next.
+    class HalfEdge::Face
+    {
+        NV_FORBID_COPY(Face);
+    public:
+
+        uint id;
+        uint16 group;
+        uint16 material;
+        Edge * edge;
+
+        // group and material are initialized from ~0; edge starts out NULL.
+        Face(uint id) : id(id), group(~0), material(~0), edge(NULL) {}
+
+        float area() const;
+        float parametricArea() const;
+        float boundaryLength() const;
+        Vector3 normal() const;
+        Vector3 centroid() const;
+
+        bool isValid() const;
+
+        bool contains(const Edge * e) const;
+        uint edgeIndex(const Edge * e) const;
+        
+        Edge * edgeAt(uint idx);
+        const Edge * edgeAt(uint idx) const;
+
+        uint edgeCount() const;
+        bool isBoundary() const;
+        uint boundaryCount() const;
+
+        // The iterator that visits the edges of this face in clockwise order, following Edge::next from the start edge.
+        // isDone() is false before the first advance() (unless the start edge is NULL) and true once the walk is back at the start.
+        class EdgeIterator //: public Iterator<Edge *>
+        {
+        public:
+            EdgeIterator(Edge * e) : m_end(NULL), m_current(e) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->next;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual Edge * current() const { return m_current; }
+            Vertex * vertex() const { return m_current->vertex; }
+
+        private:
+            Edge * m_end;
+            Edge * m_current;
+        };
+
+        EdgeIterator edges() { return EdgeIterator(edge); }
+        EdgeIterator edges(Edge * e)
+        { 
+            nvDebugCheck(contains(e));
+            return EdgeIterator(e); 
+        }
+
+        // Read-only iterator that visits the edges of this face in clockwise order; also constructible from an EdgeIterator's current edge.
+        class ConstEdgeIterator //: public Iterator<const Edge *>
+        {
+        public:
+            ConstEdgeIterator(const Edge * e) : m_end(NULL), m_current(e) { }
+            ConstEdgeIterator(const EdgeIterator & it) : m_end(NULL), m_current(it.current()) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->next;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual const Edge * current() const { return m_current; }
+            const Vertex * vertex() const { return m_current->vertex; }
+
+        private:
+            const Edge * m_end;
+            const Edge * m_current;
+        };
+
+        ConstEdgeIterator edges() const { return ConstEdgeIterator(edge); }
+        ConstEdgeIterator edges(const Edge * e) const
+        { 
+            nvDebugCheck(contains(e));
+            return ConstEdgeIterator(e); 
+        }
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_HALFEDGE_FACE_H
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.cpp b/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.cpp
new file mode 100644
index 0000000000..0012513bce
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.cpp
@@ -0,0 +1,1284 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmesh.h" // pch
+
+#include "Mesh.h"
+#include "Edge.h"
+#include "Vertex.h"
+#include "Face.h"
+
+#include "nvmesh/TriMesh.h"
+#include "nvmesh/QuadTriMesh.h"
+#include "nvmesh/MeshBuilder.h"
+
+#include "nvmath/Vector.inl"
+#include "nvcore/Array.inl"
+#include "nvcore/HashMap.inl"
+
+
+using namespace nv;
+using namespace HalfEdge;
+
+Mesh::Mesh() : m_colocalVertexCount(0)
+{   // Empty mesh: zero colocal vertices, zero errors.
+    errorCount = 0; // NOTE(review): errorIndex0/errorIndex1 (written by canAddFace) are not set here -- confirm they are initialized elsewhere.
+}
+
+// Build a copy of 'mesh': vertex pos/nor/tex and the vertex loop of each face are
+// duplicated. Colocal links are not copied; m_colocalVertexCount is the vertex count.
+Mesh::Mesh(const Mesh * mesh)
+{
+    errorCount = 0;
+
+    // Duplicate the vertices, keeping the same ids.
+    const uint vertexCount = mesh->vertexCount();
+    m_vertexArray.resize(vertexCount);
+
+    for (uint v = 0; v < vertexCount; v++)
+    {
+        const Vertex * vertex = mesh->vertexAt(v);
+        nvDebugCheck(vertex->id == v);
+
+        Vertex * copy = new Vertex(v);
+        copy->pos = vertex->pos;
+        copy->nor = vertex->nor;
+        copy->tex = vertex->tex;
+        m_vertexArray[v] = copy;
+    }
+    m_colocalVertexCount = vertexCount;
+
+    // Rebuild each face from the vertex ids found along its edge loop.
+    Array<uint> indexArray(3);
+    const uint faceCount = mesh->faceCount();
+
+    for (uint f = 0; f < faceCount; f++)
+    {
+        indexArray.clear();
+
+        Face::ConstEdgeIterator it(mesh->faceAt(f)->edges());
+        while (!it.isDone()) {
+            indexArray.append(it.current()->from()->id);
+            it.advance();
+        }
+
+        addFace(indexArray);
+    }
+}
+
+Mesh::~Mesh()
+{
+    clear(); // deletes the vertices, edges and faces held by the mesh
+}
+
+
+void Mesh::clear()
+{
+    deleteAll(m_vertexArray); // delete the vertex objects, then empty the array
+    m_vertexArray.clear();
+    // Edges are deleted through the edge map, not through m_edgeArray (see the note below).
+    foreach(i, m_edgeMap)
+    {
+        delete m_edgeMap[i].value;
+    }
+    //deleteAll(m_edgeArray);	// edgeArray only contains 1/2 of the edges!
+    m_edgeArray.clear();
+    m_edgeMap.clear();
+
+    deleteAll(m_faceArray);
+    m_faceArray.clear();
+}
+
+
+// Append a new vertex at 'pos'; its id is the vertex count before the insertion.
+Vertex * Mesh::addVertex(const Vector3 & pos)
+{
+    nvDebugCheck(isFinite(pos));
+
+    const uint newId = m_vertexArray.count();
+    Vertex * created = new Vertex(newId);
+    created->pos = pos;
+
+    m_vertexArray.append(created);
+    return created;
+}
+
+/*Vertex * Mesh::addVertex(uint id, const Vector3 & pos)
+{
+    nvDebugCheck(isFinite(pos));
+
+    Vertex * v = new Vertex(id);
+    v->pos = pos;
+    m_vertexArray.append(v);
+
+    return v;
+}*/
+
+/*void Mesh::addVertices(const Mesh * mesh)
+{
+nvCheck(mesh != NULL);
+
+// Add mesh vertices
+for (uint v = 0; v < vertexCount; v++)
+{
+const Vertex * vertex = mesh->vertexAt(v);
+nvDebugCheck(vertex != NULL);
+
+Vertex * v = addVertex(vertex->pos());
+nvDebugCheck(v != NULL);
+
+v->setNor(vertex->nor());
+v->setTex(vertex->tex());
+}
+}*/
+
+
+/// Link colocal vertices based on geometric location only: vertices that map to the
+/// same position key are linked to the first vertex seen at that position.
+void Mesh::linkColocals()
+{
+    nvDebug("--- Linking colocals:\n");
+
+    const uint total = this->vertexCount();
+    HashMap<Vector3, Vertex *> firstAtPosition(total);
+
+    for (uint v = 0; v < total; v++)
+    {
+        Vertex * vertex = vertexAt(v);
+        Vertex * representative;
+
+        if (!firstAtPosition.get(vertex->pos, &representative))
+        {
+            // First vertex at this position: it becomes the representative.
+            firstAtPosition.add(vertex->pos, vertex);
+            continue;
+        }
+        representative->linkColocal(vertex);
+    }
+
+    // One map entry per distinct position.
+    m_colocalVertexCount = firstAtPosition.count();
+    nvDebug("---   %d vertex positions.\n", m_colocalVertexCount);
+
+    // @@ Remove duplicated vertices? or just leave them as colocals?
+}
+
+// Link colocal vertices using a caller-provided map: vertices that share the same
+// canonicalMap entry are linked to the first vertex seen with that entry.
+void Mesh::linkColocalsWithCanonicalMap(const Array<uint> & canonicalMap)
+{
+    nvDebug("--- Linking colocals:\n");
+
+    // The lookup table needs one slot per canonical index: size = largest index + 1.
+    uint slotCount = 0;
+    foreach(i, canonicalMap) {
+        slotCount = max(slotCount, canonicalMap[i] + 1);
+    }
+
+    Array<Vertex *> firstWithIndex;
+    firstWithIndex.resize(slotCount, NULL);
+    m_colocalVertexCount = 0;
+
+    const uint total = this->vertexCount();
+    for (uint v = 0; v < total; v++)
+    {
+        Vertex * vertex = vertexAt(v);
+        Vertex * colocal = firstWithIndex[canonicalMap[v]];
+
+        if (colocal == NULL)
+        {
+            // First vertex with this canonical index.
+            firstWithIndex[canonicalMap[v]] = vertex;
+            m_colocalVertexCount++;
+            continue;
+        }
+        nvDebugCheck(vertex->pos == colocal->pos);
+        colocal->linkColocal(vertex);
+    }
+    nvDebug("---   %d vertex positions.\n", m_colocalVertexCount);
+}
+
+
+Face * Mesh::addFace() {
+    // New face with no edge assigned yet; its id is the face count before the insertion.
+    Face * created = new Face(m_faceArray.count());
+    m_faceArray.append(created);
+    return created;
+}
+
+Face * Mesh::addFace(uint v0, uint v1, uint v2) {
+    // Triangle convenience overload: pack the indices and forward to the range overload.
+    Array<uint> corners(3);
+    corners << v0 << v1 << v2;
+    return addFace(corners, 0, 3);
+}
+
+Face * Mesh::addFace(uint v0, uint v1, uint v2, uint v3) {
+    // Quad convenience overload: pack the indices and forward to the range overload.
+    Array<uint> corners(4);
+    corners << v0 << v1 << v2 << v3;
+    return addFace(corners, 0, 4);
+}
+
+Face * Mesh::addFace(const Array<uint> & indexArray) {
+    const uint indexCount = indexArray.count(); // the whole array describes one face
+    return addFace(indexArray, 0, indexCount);
+}
+
+
+// Add a face whose corners are indexArray[first .. first+num-1], in that order.
+// Returns NULL (and increments errorCount) when canAddFace() rejects the loop.
+Face * Mesh::addFace(const Array<uint> & indexArray, uint first, uint num)
+{
+    nvDebugCheck(first < indexArray.count());
+    nvDebugCheck(num <= indexArray.count()-first);
+    nvDebugCheck(num > 2);
+
+    if (!canAddFace(indexArray, first, num)) {
+        errorCount++;
+        return NULL;
+    }
+
+    Face * f = new Face(m_faceArray.count());
+
+    Edge * firstEdge = NULL;
+    Edge * last = NULL;
+
+    // One edge per corner; the last corner wraps around to the first one.
+    for (uint i = 0; i < num; i++)
+    {
+        const uint from = indexArray[first + i];
+        const uint to = indexArray[first + (i + 1) % num];
+
+        Edge * current = addEdge(from, to);
+        nvCheck(current != NULL && current->face == NULL);
+        current->face = f;
+
+        // Chain the edge after the previous one, or remember it as the loop start.
+        if (last == NULL) firstEdge = current;
+        else last->setNext(current);
+
+        last = current;
+    }
+
+    // Close the loop.
+    last->setNext(firstEdge);
+
+    f->edge = firstEdge;
+    m_faceArray.append(f);
+
+    return f;
+}
+
+/*void Mesh::addFaces(const Mesh * mesh)
+{
+nvCheck(mesh != NULL);
+
+Array indexArray;
+// Add faces
+
+}*/
+
+
+// Return true if the face can be added to the manifold mesh.
+// On failure errorIndex0/errorIndex1 always receive the endpoints of the offending
+// edge, including when the face is rejected because it repeats one of its own edges.
+bool Mesh::canAddFace(const Array<uint> & indexArray, uint first, uint num) const
+{
+    // Every edge of the loop (previous corner -> current corner) must be addable.
+    for (uint j = num - 1, i = 0; i < num; j = i++) {
+        if (!canAddEdge(indexArray[first+j], indexArray[first+i])) {
+            errorIndex0 = indexArray[first+j];
+            errorIndex1 = indexArray[first+i];
+            return false;
+        }
+    }
+
+    // We also have to make sure the face does not have any duplicate edge!
+    for (uint i = 0; i < num; i++) {
+        const uint i0 = indexArray[first + i];
+        const uint i1 = indexArray[first + (i + 1)%num];
+
+        for (uint j = i + 1; j < num; j++) {
+            if (i0 == indexArray[first + j] && i1 == indexArray[first + (j + 1)%num]) {
+                errorIndex0 = i0;
+                errorIndex1 = i1;
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+// Return true if the edge i -> j is not degenerate and either doesn't exist or doesn't
+// have any adjacent face.
+bool Mesh::canAddEdge(uint i, uint j) const
+{
+    // Degenerate: both ends are the same vertex.
+    if (i == j) return false;
+
+    // Degenerate as well: the end vertex is one of the colocals of the start vertex.
+    const Vertex * start = vertexAt(i);
+    const Vertex * end = vertexAt(j);
+
+    Vertex::ConstVertexIterator it(start->colocals());
+    while (!it.isDone())
+    {
+        if (it.current() == end) return false;
+        it.advance();
+    }
+
+    // Make sure edge has not been added yet.
+    Edge * existing = findEdge(i, j);
+    if (existing == NULL) return true;
+
+    // We ignore edges that don't have an adjacent face yet, since this face could
+    // become the edge's face.
+    return existing->face == NULL;
+}
+
+// Return the half-edge i -> j, creating it when findEdge() does not find one.
+// Only edges created without an existing pair are appended to m_edgeArray; every
+// new edge is registered in m_edgeMap under Key(i,j).
+// Face and Next are set by addFace.
+Edge * Mesh::addEdge(uint i, uint j)
+{
+    nvCheck(i != j);
+
+    Edge * edge = findEdge(i, j);
+    if (edge != NULL)
+    {
+        // Edge may already exist, but its face must not be set.
+        nvDebugCheck(edge->face == NULL);
+
+        // Nothing else to do!
+        return edge;
+    }
+
+    // Add new edge. Start by looking up its pair (the edge j -> i).
+    Edge * pair = findEdge(j, i);
+
+    if (pair == NULL)
+    {
+        // No pair yet: the id is twice the index the edge gets in m_edgeArray.
+        edge = new Edge(2*m_edgeArray.count());
+
+        // Add only unpaired edges.
+        m_edgeArray.append(edge);
+    }
+    else
+    {
+        // The id is the pair's id plus one.
+        edge = new Edge(pair->id + 1);
+
+        // Link edge pairs.
+        edge->pair = pair;
+        pair->pair = edge;
+
+        // @@ I'm not sure this is necessary!
+        pair->vertex->setEdge(pair);
+    }
+
+    // Attach the origin vertex and register the edge under its (i, j) key.
+    edge->vertex = m_vertexArray[i];
+    m_edgeMap.add(Key(i,j), edge);
+
+    return edge;
+}
+
+
+/// Find the edge from vertex i to vertex j, testing every combination of their colocals. Returns NULL if none is found.
+Edge * Mesh::findEdge(uint i, uint j) const
+{
+    Edge * edge = NULL;
+
+    const Vertex * v0 = vertexAt(i);
+    const Vertex * v1 = vertexAt(j);
+
+    // Test all colocal pairs.
+    for(Vertex::ConstVertexIterator it0(v0->colocals()); !it0.isDone(); it0.advance())
+    {
+        for(Vertex::ConstVertexIterator it1(v1->colocals()); !it1.isDone(); it1.advance())
+        {
+            Key key(it0.current()->id, it1.current()->id);
+            // Without _DEBUG the first match is returned at once; with _DEBUG the scan continues to check uniqueness.
+            if (edge == NULL) {
+                m_edgeMap.get(key, &edge);
+#if !defined(_DEBUG)
+                if (edge != NULL) return edge;
+#endif
+            }
+            else {
+                // Make sure that only one edge is found.
+                nvDebugCheck(!m_edgeMap.get(key));
+            }
+        }
+    }
+
+    return edge;
+}
+
+/// Link boundary edges once the mesh has been created: unpaired edges receive a face-less pair, then face-less pairs are chained.
+void Mesh::linkBoundary()
+{
+    nvDebug("--- Linking boundaries:\n");
+
+    int num = 0;
+
+    // Create boundary edges: every edge of m_edgeArray without a pair gets a new one with id + 1.
+    uint edgeCount = this->edgeCount();
+    for(uint e = 0; e < edgeCount; e++)
+    {
+        Edge * edge = edgeAt(e);
+        if (edge != NULL && edge->pair == NULL) {
+            Edge * pair = new Edge(edge->id + 1);
+            // The new pair runs from j (the origin of edge->next) back to i (the origin of edge).
+            uint i = edge->from()->id;
+            uint j = edge->next->from()->id;
+
+            Key key(j,i);
+            nvCheck(!m_edgeMap.get(key));
+
+            pair->vertex = m_vertexArray[j];
+            m_edgeMap.add(key, pair);
+
+            edge->pair = pair;
+            pair->pair = edge;
+
+            num++;
+        }
+    }
+
+    // Link boundary edges: every pair without a face is linked, not only the ones created above.
+    for (uint e = 0; e < edgeCount; e++) {
+        Edge * edge = edgeAt(e);
+        if (edge != NULL && edge->pair->face == NULL) {
+            linkBoundaryEdge(edge->pair);
+        }
+    }
+
+    nvDebug("---   %d boundary edges.\n", num);
+}
+
+/// Link this boundary edge (an edge without a face) to the face-less edge that follows it.
+void Mesh::linkBoundaryEdge(Edge * edge)
+{
+    nvCheck(edge->face == NULL);
+
+    // Make sure next pointer has not been set. @@ We want to be able to relink boundary edges after mesh changes.
+    //nvCheck(edge->next() == NULL);
+    // Step to the predecessor of the current pair, repeatedly, until reaching an edge whose pair has no face.
+    Edge * next = edge;
+    while(next->pair->face != NULL) {
+        // Get pair prev by walking 'next' pointers around the pair's loop.
+        Edge * e = next->pair->next;
+        while (e->next != next->pair) {
+            e = e->next;
+        }
+        next = e;
+    }
+    edge->setNext(next->pair);
+
+    // Adjust vertex edge, so that it's the boundary edge. (required for isBoundary())
+    if (edge->vertex->edge != edge)
+    {
+        // Multiple boundaries in the same edge.
+        //nvCheck( edge->vertex()->edge() == NULL || edge->vertex()->edge()->face() != NULL );
+        edge->vertex->edge = edge;
+    }
+}
+
+
+/// Convert to tri mesh (allocated with new). Vertices are copied in order; each face is split as a fan around its first vertex.
+TriMesh * Mesh::toTriMesh() const
+{
+    uint triangleCount = 0;
+
+    // Count triangles: a face with n edges contributes n - 2 of them.
+    const uint faceCount = this->faceCount();
+    for(uint f = 0; f < faceCount; f++)
+    {
+        const Face * face = faceAt(f);
+        triangleCount += face->edgeCount() - 2;
+    }
+
+    TriMesh * triMesh = new TriMesh(triangleCount, vertexCount());
+
+    // Add vertices.
+    Array<TriMesh::Vertex> & vertices = triMesh->vertices();
+
+    const uint vertexCount = this->vertexCount();
+    for(uint v = 0; v < vertexCount; v++)
+    {
+        const Vertex * vertex = vertexAt(v);
+
+        TriMesh::Vertex triVertex;
+        triVertex.id = vertices.count();
+        triVertex.pos = vertex->pos;
+        triVertex.nor = vertex->nor;
+        triVertex.tex = vertex->tex;
+
+        vertices.append(triVertex);
+    }
+
+    // Add triangles.
+    Array<TriMesh::Face> & triangles = triMesh->faces();
+
+    for(uint f = 0; f < faceCount; f++)
+    {
+        const Face * face = faceAt(f);
+
+        // @@ Triangulate arbitrary polygons correctly.
+        const uint v0 = face->edge->vertex->id;
+        uint v1 = face->edge->next->vertex->id;
+
+        for(Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            uint v2 = it.current()->vertex->id;
+
+            // Skip the first two vertices.
+            if (v2 == v0 || v2 == v1) continue;
+
+            TriMesh::Face triangle;
+            triangle.id = triangles.count();
+            triangle.v[0] = v0;
+            triangle.v[1] = v1;
+            triangle.v[2] = v2;
+            // The current v2 becomes v1 of the next fan triangle.
+            v1 = v2;
+
+            triangles.append(triangle);
+        }
+    }
+
+    return triMesh;
+}
+
+// Convert to a QuadTriMesh by replaying every vertex and face through a MeshBuilder.
+QuadTriMesh * Mesh::toQuadTriMesh() const
+{
+    MeshBuilder builder;
+
+    // Feed positions, normals and texture coordinates in vertex order.
+    const uint totalVertices = this->vertexCount();
+    builder.hintVertexCount(totalVertices);
+
+    for (uint v = 0; v < totalVertices; v++)
+    {
+        const Vertex * source = vertexAt(v);
+        builder.addPosition(source->pos);
+        builder.addNormal(source->nor);
+        builder.addTexCoord(source->tex);
+    }
+
+    // Emit one polygon per face.
+    const uint totalFaces = this->faceCount();
+    builder.hintTriangleCount(totalFaces);
+
+    for (uint f = 0; f < totalFaces; f++)
+    {
+        builder.beginPolygon();
+        Face::ConstEdgeIterator it(faceAt(f)->edges());
+        while (!it.isDone())
+        {
+            // The same vertex id is passed for all three addVertex() arguments.
+            const uint id = it.current()->vertex->id;
+            builder.addVertex(id, id, id);
+            it.advance();
+        }
+        builder.endPolygon();
+    }
+
+    builder.done();
+    return builder.buildQuadTriMesh();
+}
+
+
+// Triangulate in place. Faces with more than three edges are replaced by a triangle fan.
+// The old edges are released through the edge map (the same ownership rule clear() uses),
+// because m_edgeArray only references the unpaired half of them.
+void Mesh::triangulate() {
+
+    const uint faceCount = m_faceArray.count();
+
+    // Nothing to do when every face is already a triangle.
+    bool all_triangles = true;
+    for (uint f = 0; f < faceCount && all_triangles; f++) {
+        all_triangles = (m_faceArray[f]->edgeCount() == 3);
+    }
+    if (all_triangles) {
+        return;
+    }
+
+    // Do not touch vertices, but rebuild edges and faces.
+    Array<Edge *> edgeArray;
+    Array<Face *> faceArray;
+
+    // Collect every old edge before the map forgets them; they must stay alive
+    // while the old faces are being traversed below.
+    foreach(i, m_edgeMap) {
+        edgeArray.append(m_edgeMap[i].value);
+    }
+    m_edgeArray.clear();
+    swap(faceArray, m_faceArray);
+    m_edgeMap.clear();
+
+    for (uint f = 0; f < faceCount; f++) {
+        Face * face = faceArray[f];
+
+        // Trivial fan-like triangulation.
+        const uint v0 = face->edge->vertex->id;
+        uint v2, v1 = -1;
+
+        for (Face::EdgeIterator it(face->edges()); !it.isDone(); it.advance()) {
+            v2 = it.current()->to()->id;
+            if (v2 == v0) break;
+            if (v1 != -1) addFace(v0, v1, v2);
+            v1 = v2;
+        }
+    }
+
+    nvDebugCheck(m_faceArray.count() > faceCount); // triangle count > face count
+
+    linkBoundary();
+    deleteAll(edgeArray);
+    deleteAll(faceArray);
+}
+
+
+/*
+Fixing T-junctions.
+
+- Find T-junctions. Find  vertices that are on an edge. 
+    - This test is approximate.
+    - Insert edges on a spatial index to speedup queries.
+    - Consider only open edges, that is edges that have no pairs.
+    - Consider only vertices on boundaries.
+- Close T-junction.
+    - Split edge.
+
+*/
+bool Mesh::splitBoundaryEdges() {
+    // Returns true when at least one boundary edge was split.
+    Array<Vertex *> boundaryVertices;
+    // Gather the vertices for which isBoundary() is true.
+    foreach(i, m_vertexArray) {
+        Vertex * v = m_vertexArray[i];
+        if (v->isBoundary()) {
+            boundaryVertices.append(v);
+        }
+    }
+
+    nvDebug("Fixing T-junctions:\n");
+
+    int splitCount = 0;
+
+    foreach(v, boundaryVertices) {
+        Vertex * vertex = boundaryVertices[v];
+
+        Vector3 x0 = vertex->pos;
+
+        // Find edges that this vertex overlaps with.
+        foreach(e, m_edgeArray) {
+        //for (uint e = 0; e < m_edgeArray.count(); e++) {
+            Edge * edge = m_edgeArray[e];
+            if (edge != NULL && edge->isBoundary()) {
+                // An edge is never split at one of its own endpoints.
+                if (edge->from() == vertex || edge->to() == vertex) {
+                    continue;
+                }
+
+                Vector3 x1 = edge->from()->pos;
+                Vector3 x2 = edge->to()->pos;
+
+                Vector3 v01 = x0 - x1;
+                Vector3 v21 = x2 - x1;
+                // d: distance from x0 to the line through x1 and x2 (|v01 x v21| / |v21|).
+                float l = length(v21);
+                float d = length(cross(v01, v21)) / l;
+
+                if (isZero(d)) {
+                    float t = dot(v01, v21) / (l * l);
+                    // t: position of x0's projection along the edge; 0 is x1 and 1 is x2.
+                    // @@ Snap x0 to x1 or x2, if too close? No, do vertex snapping elsewhere.
+                    /*if (equal(t, 0.0f, 0.01f)) {
+                        //vertex->setPos(x1);
+                    }
+                    else if (equal(t, 1.0f, 0.01f)) {
+                        //vertex->setPos(x2);
+                    }
+                    else*/
+                    if (t > 0.0f + NV_EPSILON && t < 1.0f - NV_EPSILON) {
+                        nvDebugCheck(equal(lerp(x1, x2, t), x0));
+
+                        Vertex * splitVertex = splitBoundaryEdge(edge, t, x0);
+                        vertex->linkColocal(splitVertex);   // @@ Should we do this here?
+                        splitCount++;
+                    }
+                }
+            }
+        }
+    }
+
+    nvDebug(" - %d edges split.\n", splitCount);
+
+    nvDebugCheck(isValid());
+
+    return splitCount != 0;
+}
+
+
+// For this to be effective, we have to fix the boundary junctions first.
+Edge * Mesh::sewBoundary(Edge * startEdge) {
+    nvDebugCheck(startEdge->face == NULL);
+    // Walks the boundary loop starting at startEdge and sews pairs of consecutive boundary edges whose far ends coincide.
+    // @@ We may want to be more conservative linking colocals in order to preserve the input topology. One way of doing that is by linking colocals only 
+    // if the vertices next to them are linked as well. That is, by sewing boundaries after detecting them. If any pair of consecutive edges have their first
+    // and last vertex in the same position, then it can be linked.
+
+    Edge * lastBoundarySeen = startEdge;   // Returned to the caller; also the stop marker of the loop below.
+
+    nvDebug("Sewing Boundary:\n");
+
+    int count = 0;       // Loop iterations, only used for the debug message.
+    int sewnCount = 0;   // Number of edge pairs sewn together.
+
+    Edge * edge = startEdge;
+    do {
+        nvDebugCheck(edge->face == NULL);
+        // edge_a is the current boundary edge and edge_b the one before it; they meet at v1a / v1b.
+        Edge * edge_a = edge;
+        Edge * edge_b = edge->prev;
+
+        Edge * pair_a = edge_a->pair;
+        Edge * pair_b = edge_b->pair;
+
+        Vertex * v0a = edge_a->to();
+        Vertex * v0b = edge_b->from();
+        Vertex * v1a = edge_a->from();
+        Vertex * v1b = edge_b->to();
+
+        nvDebugCheck(v1a->isColocal(v1b));
+
+        /*
+        v0b +      _+ v0a
+             \     /
+            b \   / a
+               \|/
+            v1b + v1a
+        */
+
+        // @@ This should not happen while sewing, but it may be produced somewhere else.
+        nvDebugCheck(edge_a != edge_b);
+
+        if (v0a->pos == v0b->pos) {
+            // The far ends coincide exactly, so a and b can be collapsed into one interior edge.
+            // Link vertices.
+            v0a->linkColocal(v0b);
+            
+            // Remove edges to be collapsed.
+            disconnect(edge_a);
+            disconnect(edge_b);
+            disconnect(pair_a);
+            disconnect(pair_b);
+            // disconnect() does not clear an edge's own prev/next/pair/face, so they are still read below.
+            // Link new boundary edges.
+            Edge * prevBoundary = edge_b->prev;
+            Edge * nextBoundary = edge_a->next;
+            if (nextBoundary != NULL) {
+                nvDebugCheck(nextBoundary->face == NULL);
+                nvDebugCheck(prevBoundary->face == NULL);
+                nextBoundary->setPrev(prevBoundary);
+            
+                // Make sure boundary vertex points to boundary edge.
+                v0a->setEdge(nextBoundary); // This updates all colocals.
+            }
+            lastBoundarySeen = prevBoundary;
+
+            // Create new edges: one half-edge on each side, taking over the face and neighbours of the old pairs.
+            Edge * newEdge_a = addEdge(v0a->id, v1a->id);   // pair_a->from()->id, pair_a->to()->id
+            Edge * newEdge_b = addEdge(v1b->id, v0b->id);
+
+            newEdge_a->pair = newEdge_b;
+            newEdge_b->pair = newEdge_a;
+
+            newEdge_a->face = pair_a->face;
+            newEdge_b->face = pair_b->face;
+
+            newEdge_a->setNext(pair_a->next);
+            newEdge_a->setPrev(pair_a->prev);
+
+            newEdge_b->setNext(pair_b->next);
+            newEdge_b->setPrev(pair_b->prev);
+            // The four old half-edges are no longer referenced by the code above and can be freed.
+            delete edge_a;
+            delete edge_b;
+            delete pair_a;
+            delete pair_b;
+
+            edge = nextBoundary;    // If nextBoundary is NULL we have closed the loop.
+            sewnCount++;
+        }
+        else {
+            edge = edge->next;
+        }
+        
+        count++;
+    } while(edge != NULL && edge != lastBoundarySeen);
+
+    nvDebug(" - Sewn %d out of %d.\n", sewnCount, count);
+
+    if (lastBoundarySeen != NULL) {
+        nvDebugCheck(lastBoundarySeen->face == NULL);
+    }
+
+    return lastBoundarySeen;
+}
+
+
+// @@ We must always disconnect edge pairs simultaneously.
+void Mesh::disconnect(Edge * edge) {
+    nvDebugCheck(edge != NULL);
+
+    // Detaches a half-edge from the edge array, the edge map, its vertex, its face
+    // and its prev/next neighbours. The edge's own pointers are not cleared and the
+    // edge is not freed. The link to its pair is kept on purpose (see the note
+    // further down).
+
+    // Edge array: only even-numbered half-edges own a slot, at index id / 2.
+    const bool ownsArraySlot = (edge->id & 1) == 0;
+    if (ownsArraySlot) {
+        nvDebugCheck(m_edgeArray[edge->id / 2] == edge);
+        m_edgeArray[edge->id / 2] = NULL;
+    }
+
+    // Edge map: entries are keyed by the (from, to) vertex ids. @@ Store map key inside edge?
+    nvDebugCheck(edge->from() != NULL && edge->to() != NULL);
+    bool removed = m_edgeMap.remove(Key(edge->from()->id, edge->to()->id));
+    nvDebugCheck(removed == true);
+
+    // Vertex: if the vertex uses this edge as its first edge, hand it another one
+    // (prev->pair first, then pair->next), or NULL when neither exists.
+    Vertex * const origin = edge->vertex;
+    if (origin != NULL && origin->edge == edge) {
+        Edge * substitute = NULL;   // @@ Remove disconnected vertex?
+        if (edge->prev && edge->prev->pair) {
+            substitute = edge->prev->pair;
+        }
+        else if (edge->pair && edge->pair->next) {
+            substitute = edge->pair->next;
+        }
+        origin->edge = substitute;
+    }
+    //edge->setVertex(NULL);
+
+    // Face: same idea, falling back to next, then prev, then NULL.
+    Face * const owner = edge->face;
+    if (owner != NULL && owner->edge == edge) {
+        Edge * substitute = NULL;   // @@ Remove disconnected face?
+        if (edge->next != NULL && edge->next != edge) {
+            substitute = edge->next;
+        }
+        else if (edge->prev != NULL && edge->prev != edge) {
+            substitute = edge->prev;
+        }
+        owner->edge = substitute;
+    }
+    //edge->setFace(NULL);
+
+    // @@ Hack, we don't disconnect from pair, because pair needs us to remove itself from the map.
+    // Disconect from pair.
+    /*if (edge->pair != NULL) {
+        if (edge->pair->pair == edge) {
+            edge->pair->setPair(NULL);
+        }
+        //edge->setPair(NULL);
+    }*/
+
+    // Previous edge: clear its forward link if it still points here.
+    Edge * const before = edge->prev;
+    if (before != NULL && before->next == edge) {
+        before->setNext(NULL);
+    }
+    //edge->setPrev(NULL);
+
+    // Next edge: clear its backward link if it still points here.
+    Edge * const after = edge->next;
+    if (after != NULL && after->prev == edge) {
+        after->setPrev(NULL);
+    }
+    //edge->setNext(NULL);
+}
+
+
+void Mesh::remove(Edge * edge) {
+    // Detach the half-edge from the mesh bookkeeping first, then release its memory.
+    // edge->pair is not touched by this function.
+    nvDebugCheck(edge != NULL);
+    disconnect(edge);
+    delete edge;
+}
+
+void Mesh::remove(Vertex * vertex) {
+    nvDebugCheck(vertex != NULL);
+
+    // Leave a NULL hole in the vertex list at the vertex's own index.
+    m_vertexArray[vertex->id] = NULL;
+
+    // Take the vertex out of its ring of colocals.
+    vertex->unlinkColocal();
+
+    // @@ Removing a connected vertex is asking for trouble...
+    Edge * const outgoing = vertex->edge;
+    if (outgoing != NULL) {
+        // @@ Connect edge to a colocal?
+        if (outgoing->vertex == vertex) {
+            outgoing->vertex = NULL;
+        }
+        // The ring was unlinked above, so this only resets this vertex's own edge pointer.
+        vertex->setEdge(NULL);
+    }
+
+    delete vertex;
+}
+
+void Mesh::remove(Face * face) {
+    nvDebugCheck(face != NULL);
+
+    // Leave a NULL hole in the face list at the face's own index.
+    m_faceArray[face->id] = NULL;
+
+    // Break the link with the face's representative edge, in both directions.
+    // Only face->edge is updated here; no other edge is visited.
+    Edge * const anchor = face->edge;
+    if (anchor != NULL) {
+        nvDebugCheck(anchor->face == face);
+        anchor->face = NULL;
+        face->edge = NULL;
+    }
+
+    delete face;
+}
+
+
+void Mesh::compactEdges() {
+    // Slide the surviving edges down over the NULL holes, renumbering each moved edge
+    // (2 * slot) and its pair (2 * slot + 1), then trim the array to the survivor count.
+    const uint total = m_edgeArray.count();
+    uint write = 0;
+
+    for (uint read = 0; read < total; read++) {
+        Edge * const survivor = m_edgeArray[read];
+        if (survivor == NULL) continue;
+
+        if (read != write) {
+            m_edgeArray[write] = survivor;
+            survivor->id = 2 * write;
+            if (survivor->pair != NULL) survivor->pair->id = 2 * write + 1;
+        }
+        write++;
+    }
+    m_edgeArray.resize(write);
+}
+
+
+void Mesh::compactVertices() {
+    // Slide the surviving vertices down over the NULL holes; a moved vertex gets
+    // its new array index as id. Finally trim the array to the survivor count.
+    const uint total = m_vertexArray.count();
+    uint write = 0;
+
+    for (uint read = 0; read < total; read++) {
+        Vertex * const survivor = m_vertexArray[read];
+        if (survivor == NULL) continue;
+        if (read != write) {
+            m_vertexArray[write] = survivor;
+            survivor->id = write;
+        }
+        write++;
+    }
+    m_vertexArray.resize(write);
+    // @@ Generate xref array for external attributes.
+}
+
+
+void Mesh::compactFaces() {
+    // Close the NULL holes in the face array: move survivors down, renumber the moved ones, trim.
+    const uint total = m_faceArray.count();
+    uint write = 0;
+
+    for (uint read = 0; read < total; read++) {
+        Face * const survivor = m_faceArray[read];
+        if (survivor == NULL) continue;
+        if (read != write) {
+            m_faceArray[write] = survivor;
+            survivor->id = write;
+        }
+        write++;
+    }
+    m_faceArray.resize(write);
+}
+
+
+Vertex * Mesh::splitBoundaryEdge(Edge * edge, float t, const Vector3 & pos) {
+    // Splits `edge` and its boundary pair in two by inserting a new vertex at `pos`; returns that vertex.
+    /*
+      We want to go from this configuration:
+           
+            +   +
+            |   ^
+       edge |<->|  pair
+            v   |
+            +   +
+      
+      To this one:
+
+            +   +
+            |   ^
+         e0 |<->| p0
+            v   |
+     vertex +   + 
+            |   ^
+         e1 |<->| p1
+            v   |
+            +   +
+
+    */
+    // `t` is only used to interpolate the normal, texcoord and color of the new vertex
+    // between the end points of `edge`; the position is taken from `pos` as given.
+    Edge * pair = edge->pair;
+
+    // Make sure boundaries are linked.
+    nvDebugCheck(pair != NULL); 
+
+    // Make sure edge is a boundary edge.
+    nvDebugCheck(pair->face == NULL);
+
+    // Add new vertex.
+    Vertex * vertex = addVertex(pos);
+    vertex->nor = lerp(edge->from()->nor, edge->to()->nor, t);
+    vertex->tex = lerp(edge->from()->tex, edge->to()->tex, t);
+    vertex->col = lerp(edge->from()->col, edge->to()->col, t);
+    // disconnect() does not clear the old edges' own pointers, so edge/pair are still read below.
+    disconnect(edge);
+    disconnect(pair);
+
+    // Add edges.
+    Edge * e0 = addEdge(edge->from()->id, vertex->id);
+    Edge * p0 = addEdge(vertex->id, pair->to()->id);
+
+    Edge * e1 = addEdge(vertex->id, edge->to()->id);
+    Edge * p1 = addEdge(pair->from()->id, vertex->id);
+
+    // Link edges.
+    e0->setNext(e1);
+    p1->setNext(p0);
+
+    e0->setPrev(edge->prev);
+    e1->setNext(edge->next);
+
+    p1->setPrev(pair->prev);
+    p0->setNext(pair->next);
+
+    nvDebugCheck(e0->next == e1);
+    nvDebugCheck(e1->prev == e0);
+
+    nvDebugCheck(p1->next == p0);
+    nvDebugCheck(p0->prev == p1);
+    // The pair links are not set in this function; addEdge() is expected to have paired opposite half-edges.
+    nvDebugCheck(p0->pair == e0);
+    nvDebugCheck(e0->pair == p0);
+
+    nvDebugCheck(p1->pair == e1);
+    nvDebugCheck(e1->pair == p1);
+
+    // Link faces.
+    e0->face = edge->face;
+    e1->face = edge->face;
+
+    // Link vertices.
+    edge->from()->setEdge(e0);
+    vertex->setEdge(e1);
+    // The old half-edges were fully replaced above and can be freed.
+    delete edge;
+    delete pair;
+
+    return vertex;
+}
+
+#if 0 // Disabled: mixes accessor calls (pair->next(), id()) with field access (e0->pair), unlike the live code.
+// Without introducing new vertices.
+void Mesh::splitBoundaryEdge(Edge * edge, Vertex * vertex) {
+    // Splits `edge` at an existing `vertex`, reusing half-edges that already exist on the pair side.
+    /*
+      We want to go from this configuration:
+
+            |   | pn
+            +   +
+            |   ^
+            |   |
+       edge |<->| pair
+            |   |
+            v   |
+            +   +
+            |   | pp
+      
+      To this one:
+          \       /
+           \     /
+            +   +
+            |   ^
+         e0 |<->| p0
+            v   |
+     vertex +   + 
+            |   ^
+         e1 |<->| p1
+            v   |
+            +   +
+           /     \
+          /       \
+    */
+
+
+    Edge * pair = edge->pair;
+    Edge * pn = pair->next();   // NOTE(review): pair is dereferenced here, before the NULL check below.
+    Edge * pp = pair->prev();
+
+    // Make sure boundaries are linked.
+    nvDebugCheck(pair != NULL);
+
+    // Make sure edge is a boundary edge.
+    nvDebugCheck(pair->face() == NULL);
+
+    nvDebugCheck(edge->isValid());
+    nvDebugCheck(pair->isValid());
+
+    disconnect(edge);
+    disconnect(pair);
+
+    // Add edges.
+    Edge * e0 = addEdge(edge->from()->id(), vertex->id());
+    Edge * e1 = addEdge(vertex->id(), edge->to()->id());
+
+    // Link faces.
+    e0->setFace(edge->face());
+    e1->setFace(edge->face());
+
+    // Link pairs.
+    Edge * p0 = findEdge(vertex->id(), pair->to()->id());
+    if (p0 == NULL) {
+        p0 = addEdge(vertex->id(), pair->to()->id());
+        pn->setPrev(p0);
+    }
+    else {
+        nvDebugCheck(p0->face() != NULL);
+        if (e0->prev() != NULL) {
+            pn->setPrev(e0->prev());
+        }
+        else {
+            nvDebugCheck(pn == e0);
+        }
+    }
+    
+    Edge * p1 = findEdge(pair->from()->id(), vertex->id());
+    if (p1 == NULL) {
+        p1 = addEdge(pair->from()->id(), vertex->id());
+        pp->setNext(p1);
+    }
+    else {
+        nvDebugCheck(p1->face() != NULL);
+        if (e1->next() != NULL) {
+            pp->setPrev(e1->next());   // NOTE(review): the branch above uses pp->setNext(); setPrev here looks unintended - confirm.
+        }
+        else {
+            nvDebugCheck(pp == e1);
+        }
+    }
+
+    // Link edges.
+    e0->setNext(e1); // e1->setPrev(e0)
+
+    if (p0->face() == p1->face()) { // can be null
+        p1->setNext(p0); // p0->setPrev(p1)
+    }
+    else {
+        //if (p1->face() == NULL) p1->setNext(
+    }
+    
+
+    e0->setPrev(edge->prev());
+    e1->setNext(edge->next());
+
+    nvDebugCheck(e0->pair == p0);
+    nvDebugCheck(e1->pair == p1);
+    nvDebugCheck(p0->pair == e0);
+    nvDebugCheck(p1->pair == e1);
+
+    nvDebugCheck(e0->isValid());
+    nvDebugCheck(e1->isValid());
+    nvDebugCheck(pp->isValid());
+    nvDebugCheck(pn->isValid());
+
+    nvDebugCheck(e0->pair->isValid());
+    nvDebugCheck(e1->pair->isValid());
+    nvDebugCheck(pp->pair->isValid());
+    nvDebugCheck(pn->pair->isValid());
+
+    nvDebugCheck(edge->face->isValid());
+
+    if (pn->pair->face != NULL) {
+        nvDebugCheck(pn->pair->face->isValid());
+    }
+
+    if (pp->pair->face() != NULL) {
+        nvDebugCheck(pn->pair->face->isValid());   // NOTE(review): guards pp but validates pn - probably meant pp.
+    }
+
+    if (p0->face != NULL) {
+        nvDebugCheck(p0->face->isValid());
+    }
+
+    if (p1->face() != NULL) {
+        nvDebugCheck(p1->face()->isValid());
+    }
+
+    nvDebugCheck(isValid()); // Only for extreme debugging.
+
+    // Link vertices.
+    edge->from()->setEdge(e0);
+    vertex->setEdge(p0);
+
+    delete edge;
+    delete pair;
+}
+#endif
+
+bool Mesh::isValid() const
+{
+    // Checks every live slot of the edge array together with its pair half-edge.
+    // Slot e must hold the half-edge with id 2*e, and its pair must carry id 2*e+1.
+    const uint edgeCount = m_edgeArray.count();
+    for (uint e = 0; e < edgeCount; e++) {
+        Edge * edge = m_edgeArray[e];
+        if (edge == NULL) {
+            continue;   // Hole left by disconnect(); nothing to check.
+        }
+
+        // A missing pair is reported as invalid instead of being dereferenced below.
+        Edge * pair = edge->pair;
+        if (pair == NULL) {
+            return false;
+        }
+
+        if (edge->id != 2*e || !edge->isValid()) {
+            return false;
+        }
+        if (pair->id != 2*e+1 || !pair->isValid()) {
+            return false;
+        }
+    }
+
+    // @@ Make sure all faces are valid.
+    // @@ Make sure all vertices are valid.
+    return true;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.h b/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.h
new file mode 100644
index 0000000000..c202c2ef9a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Mesh.h
@@ -0,0 +1,274 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_HALFEDGE_MESH_H
+#define NV_MESH_HALFEDGE_MESH_H
+
+#include "nvmesh/nvmesh.h"
+#include "nvcore/Array.h"
+#include "nvcore/HashMap.h"
+
+/*
+If I were to redo this again, there are a number of things that I would do differently.
+- Edge map is only useful when importing a mesh to guarantee the result is two-manifold. However, when manipulating the mesh
+  it's a pain to maintain the map up to date.
+- Edge array only points to the even half-edges. There's no good reason for that. The map becomes required to traverse all edges
+  or you have to make sure edges are properly paired.
+- Linked boundaries. It's cleaner to assume a NULL pair means a boundary edge. Makes easier to seal boundaries. The only reason
+  why we link boundaries is to simplify traversal, but that could be done with two helper functions (nextBoundary, prevBoundary).
+- Minimize the amount of state that needs to be set in a certain way:
+    - boundary vertices point to boundary edge.
+- Remove parenthesis! Make some members public.
+- Remove member functions with side effects:
+    - e->setNext(n) modifies e->next and n->prev, instead use "link(e, n)", or "e->next = n, n->prev = e"
+*/
+
+
+namespace nv
+{
+    class Vector3;
+    class TriMesh;
+    class QuadTriMesh;
+    //template <typename T> struct Hash<Mesh::Key>;
+
+    namespace HalfEdge
+    {
+        class Edge;
+        class Face;
+        class Vertex;
+
+        /// Simple half edge mesh designed for dynamic mesh manipulation.
+        class Mesh
+        {
+        public:
+
+            Mesh();
+            Mesh(const Mesh * mesh);
+            ~Mesh();
+
+            void clear();
+
+            Vertex * addVertex(const Vector3 & pos);
+            //Vertex * addVertex(uint id, const Vector3 & pos);
+            //void addVertices(const Mesh * mesh);
+
+            void linkColocals();
+            void linkColocalsWithCanonicalMap(const Array<uint> & canonicalMap);
+            void resetColocalLinks();
+
+            Face * addFace();
+            Face * addFace(uint v0, uint v1, uint v2);
+            Face * addFace(uint v0, uint v1, uint v2, uint v3);
+            Face * addFace(const Array<uint> & indexArray);
+            Face * addFace(const Array<uint> & indexArray, uint first, uint num);
+            //void addFaces(const Mesh * mesh);
+
+            // These functions detach the given element from the mesh; disconnect(Edge *) does not delete it.
+            void disconnect(Edge * edge);
+            void disconnectPair(Edge * edge);
+            void disconnect(Vertex * vertex);
+            void disconnect(Face * face);
+            // These functions remove the given element from the mesh and delete it.
+            void remove(Edge * edge);
+            void remove(Vertex * vertex);
+            void remove(Face * face);
+
+            // Remove holes from arrays and reassign indices.
+            void compactEdges();
+            void compactVertices();
+            void compactFaces();
+
+            void triangulate();
+
+            void linkBoundary();
+            
+            bool splitBoundaryEdges(); // Returns true if any split was made.
+
+            // Sew the boundary that starts at the given edge, returns one edge that still belongs to boundary, or NULL if boundary closed.
+            HalfEdge::Edge * sewBoundary(Edge * startEdge);
+
+            // Element access by array index. Entries can be NULL after remove() until the matching compact*() call.
+            // Vertices
+            uint vertexCount() const { return m_vertexArray.count(); }
+            const Vertex * vertexAt(int i) const { return m_vertexArray[i]; }
+            Vertex * vertexAt(int i) { return m_vertexArray[i]; }
+
+            uint colocalVertexCount() const { return m_colocalVertexCount; }
+
+            // Faces
+            uint faceCount() const { return m_faceArray.count(); }
+            const Face * faceAt(int i) const { return m_faceArray[i]; }
+            Face * faceAt(int i) { return m_faceArray[i]; }
+
+            // Edges (one array entry per pair of half-edges; the entry is the even half-edge)
+            uint edgeCount() const { return m_edgeArray.count();  }
+            const Edge * edgeAt(int i) const { return m_edgeArray[i]; }
+            Edge * edgeAt(int i) { return m_edgeArray[i]; }
+
+            class ConstVertexIterator;
+            // Index-based iterators: advance() steps to the next array entry, isDone() is true at the array count.
+            class VertexIterator
+            {
+                friend class ConstVertexIterator;
+            public:
+                VertexIterator(Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->vertexCount(); }
+                virtual Vertex * current() const { return m_mesh->vertexAt(m_current); }
+
+            private:
+                HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            VertexIterator vertices() { return VertexIterator(this); }
+
+            class ConstVertexIterator
+            {
+            public:
+                ConstVertexIterator(const Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+                ConstVertexIterator(const VertexIterator & it) : m_mesh(it.m_mesh), m_current(it.m_current) { } // const ref, like the Face/Edge iterators, so temporaries bind.
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->vertexCount(); }
+                virtual const Vertex * current() const { return m_mesh->vertexAt(m_current); }
+
+            private:
+                const HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            ConstVertexIterator vertices() const { return ConstVertexIterator(this); }
+
+            class ConstFaceIterator;
+
+            class FaceIterator
+            {
+                friend class ConstFaceIterator;
+            public:
+                FaceIterator(Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->faceCount(); }
+                virtual Face * current() const { return m_mesh->faceAt(m_current); }
+
+            private:
+                HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            FaceIterator faces() { return FaceIterator(this); }
+
+            class ConstFaceIterator
+            {
+            public:
+                ConstFaceIterator(const Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+                ConstFaceIterator(const FaceIterator & it) : m_mesh(it.m_mesh), m_current(it.m_current) { }
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->faceCount(); }
+                virtual const Face * current() const { return m_mesh->faceAt(m_current); }
+
+            private:
+                const HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            ConstFaceIterator faces() const { return ConstFaceIterator(this); }
+
+            class ConstEdgeIterator;
+
+            class EdgeIterator
+            {
+                friend class ConstEdgeIterator;
+            public:
+                EdgeIterator(Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->edgeCount(); }
+                virtual Edge * current() const { return m_mesh->edgeAt(m_current); }
+
+            private:
+                HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            EdgeIterator edges() { return EdgeIterator(this); }
+
+            class ConstEdgeIterator
+            {
+            public:
+                ConstEdgeIterator(const Mesh * mesh) : m_mesh(mesh), m_current(0) { }
+                ConstEdgeIterator(const EdgeIterator & it) : m_mesh(it.m_mesh), m_current(it.m_current) { }
+
+                virtual void advance() { m_current++; }
+                virtual bool isDone() const { return m_current == m_mesh->edgeCount(); }
+                virtual const Edge * current() const { return m_mesh->edgeAt(m_current); }
+
+            private:
+                const HalfEdge::Mesh * m_mesh;
+                uint m_current;
+            };
+            ConstEdgeIterator edges() const { return ConstEdgeIterator(this); }
+
+            // @@ Add half-edge iterator.
+
+
+
+            // Convert to tri mesh.
+            TriMesh * toTriMesh() const;
+            QuadTriMesh * toQuadTriMesh() const;
+
+            bool isValid() const;
+
+        public:
+
+            // Error status:
+            mutable uint errorCount;
+            mutable uint errorIndex0;
+            mutable uint errorIndex1;
+
+        private:
+
+            bool canAddFace(const Array<uint> & indexArray, uint first, uint num) const;
+            bool canAddEdge(uint i, uint j) const;
+            Edge * addEdge(uint i, uint j);
+
+            Edge * findEdge(uint i, uint j) const;
+
+            void linkBoundaryEdge(Edge * edge);
+            Vertex * splitBoundaryEdge(Edge * edge, float t, const Vector3 & pos);
+            void splitBoundaryEdge(Edge * edge, Vertex * vertex);
+
+        private:
+
+            Array<Vertex *> m_vertexArray;
+            Array<Edge *> m_edgeArray;
+            Array<Face *> m_faceArray;
+            // Key of the edge map: the ids of the two end vertices of a half-edge, in (from, to) order.
+            struct Key {
+                Key() {}
+                Key(const Key & k) : p0(k.p0), p1(k.p1) {}
+                Key(uint v0, uint v1) : p0(v0), p1(v1) {}
+                void operator=(const Key & k) { p0 = k.p0; p1 = k.p1; }
+                bool operator==(const Key & k) const { return p0 == k.p0 && p1 == k.p1; }
+
+                uint p0;
+                uint p1;
+            };
+            friend struct Hash<Mesh::Key>;
+
+            HashMap<Key, Edge *> m_edgeMap;
+
+            uint m_colocalVertexCount;
+
+        };
+        /*
+        // This is a much better hash than the default and greatly improves performance!
+        template <> struct hash<Mesh::Key>
+        {
+        uint operator()(const Mesh::Key & k) const { return k.p0 + k.p1; }
+        };
+        */
+
+    } // HalfEdge namespace
+
+} // nv namespace
+
+#endif // NV_MESH_HALFEDGE_MESH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.cpp b/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.cpp
new file mode 100644
index 0000000000..66dad69f8a
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.cpp
@@ -0,0 +1,94 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "Vertex.h"
+
+#include "nvmath/Vector.inl"
+
+using namespace nv;
+using namespace HalfEdge;
+
+
+// Set the first edge of this vertex and of every vertex linked to it as a colocal.
+void Vertex::setEdge(Edge * e)
+{
+    // Walk the circular colocal ring once, starting and ending at this vertex.
+    Vertex * member = this;
+    do { member->edge = e; member = member->next; } while (member != this);
+}
+
+// Move this vertex and every vertex linked to it as a colocal to position p.
+void Vertex::setPos(const Vector3 & p)
+{
+    // Walk the circular colocal ring once, starting and ending at this vertex.
+    Vertex * member = this;
+    do { member->pos = p; member = member->next; } while (member != this);
+}
+
+
+uint HalfEdge::Vertex::colocalCount() const
+{
+    uint ringSize = 1;   // Count this vertex, then every other member of its colocal ring.
+    for (const Vertex * member = next; member != this; member = member->next) ringSize++;
+    return ringSize;
+}
+
+uint HalfEdge::Vertex::valence() const
+{
+    uint degree = 0;   // One per edge visited before the edge iterator returns to its start.
+    for (ConstEdgeIterator walker = edges(); !walker.isDone(); walker.advance()) degree++;
+    return degree;
+}
+
+const HalfEdge::Vertex * HalfEdge::Vertex::firstColocal() const
+{
+    // Returns the vertex with the smallest id among this vertex and its linked colocals.
+    uint firstId = id;
+    const Vertex * vertex = this;
+    for (ConstVertexIterator it(colocals()); !it.isDone(); it.advance())
+    {
+        if (it.current()->id < firstId) {
+            // Record the id of the new best candidate, so that only smaller ids can replace it.
+            firstId = it.current()->id;
+            vertex = it.current();
+        }
+    }
+    return vertex;
+}
+
+HalfEdge::Vertex * HalfEdge::Vertex::firstColocal()
+{
+    // Returns the vertex with the smallest id among this vertex and its linked colocals.
+    Vertex * vertex = this;
+    uint firstId = id;
+    for (VertexIterator it(colocals()); !it.isDone(); it.advance())
+    {
+        if (it.current()->id < firstId) {
+            // Record the id of the new best candidate, so that only smaller ids can replace it.
+            firstId = it.current()->id;
+            vertex = it.current();
+        }
+    }
+    return vertex;
+}
+
+bool HalfEdge::Vertex::isFirstColocal() const
+{
+    return this == firstColocal();   // True when no vertex in the colocal ring has a smaller id than this one.
+}
+
+bool HalfEdge::Vertex::isColocal(const Vertex * v) const {
+    // A vertex is colocal with itself; vertices at different positions never are.
+    if (v == this) return true;
+    if (pos != v->pos) return false;
+
+    // Otherwise v must be reachable by walking this vertex's ring of linked colocals.
+    const Vertex * walker = next;
+    while (walker != this) {
+        if (walker == v) return true;
+        walker = walker->next;
+    }
+    return false;
+}
+
diff --git a/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.h b/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.h
new file mode 100644
index 0000000000..1c5c8d7141
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/halfedge/Vertex.h
@@ -0,0 +1,221 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_HALFEDGE_VERTEX_H
+#define NV_MESH_HALFEDGE_VERTEX_H
+
+#include "nvmesh/halfedge/Edge.h"
+
+namespace nv
+{
+    namespace HalfEdge { class Vertex; class Face; class Edge; }
+
+    // Half edge vertex.
+    class HalfEdge::Vertex
+    {
+        NV_FORBID_COPY(Vertex);
+    public:
+
+        uint id;   // Index of this vertex in the owning mesh's vertex array.
+
+        Edge * edge;     // First edge of this vertex; NULL until one is assigned.
+        Vertex * next;   // Colocals form a circular doubly linked ring through next/prev;
+        Vertex * prev;   // a vertex without colocals points to itself.
+
+        Vector3 pos;
+        Vector3 nor;
+        Vector2 tex;
+        Vector4 col;
+
+        // Creates an unconnected vertex: no edge, zeroed attributes, alone in its colocal ring.
+        Vertex(uint id) : id(id), edge(NULL), pos(0.0f), nor(0.0f), tex(0.0f), col(0.0f) {
+            next = this;
+            prev = this;
+        }
+
+        // Both setters apply the new value to every vertex of the colocal ring.
+        void setEdge(Edge * e);
+        void setPos(const Vector3 & p);
+
+        uint colocalCount() const;
+        uint valence() const;
+        bool isFirstColocal() const;
+        const Vertex * firstColocal() const;
+        Vertex * firstColocal();
+
+        bool isColocal(const Vertex * v) const;
+
+        // Inserts v into the colocal ring right after this vertex. NOTE(review): v's previous next/prev are overwritten, so v presumably must not already be in a ring - confirm.
+        void linkColocal(Vertex * v) {
+            next->prev = v;
+            v->next = next; 
+            next = v;
+            v->prev = this;
+        }
+        void unlinkColocal() {   // Removes this vertex from its ring and leaves it pointing to itself.
+            next->prev = prev;
+            prev->next = next;
+            next = this;
+            prev = this;
+        }
+
+
+        // @@ Note: This only works if linkBoundary has been called.
+        bool isBoundary() const {
+            return (edge && !edge->face);
+        }
+
+
+        //	for(EdgeIterator it(edges()); !it.isDone(); it.advance()) { ... }
+        //
+        //	EdgeIterator it(edges());
+        //	while(!it.isDone()) {
+        //		...
+        //		it.advance();
+        //	}
+
+        // Iterator that visits the edges around this vertex in counterclockwise order.
+        class EdgeIterator //: public Iterator<Edge *>
+        {
+        public:
+            EdgeIterator(Edge * e) : m_end(NULL), m_current(e) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;   // The first call records the starting edge as the stop marker.
+                m_current = m_current->pair->next;
+                //m_current = m_current->prev->pair;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }   // Also true right away for a NULL start edge.
+            virtual Edge * current() const { return m_current; }
+            Vertex * vertex() const { return m_current->vertex; }   // NOTE(review): ConstEdgeIterator::vertex() returns to() instead - confirm which is intended.
+
+        private:
+            Edge * m_end;
+            Edge * m_current;
+        };
+
+        EdgeIterator edges() { return EdgeIterator(edge); }
+        EdgeIterator edges(Edge * e) { return EdgeIterator(e); }
+
+        // Iterator that visits the edges around this vertex in counterclockwise order.
+        class ConstEdgeIterator //: public Iterator<Edge *>
+        {
+        public:
+            ConstEdgeIterator(const Edge * e) : m_end(NULL), m_current(e) { }
+            ConstEdgeIterator(EdgeIterator it) : m_end(NULL), m_current(it.current()) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->pair->next;
+                //m_current = m_current->prev->pair;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual const Edge * current() const { return m_current; }
+            const Vertex * vertex() const { return m_current->to(); }
+
+        private:
+            const Edge * m_end;
+            const Edge * m_current;
+        };
+
+        ConstEdgeIterator edges() const { return ConstEdgeIterator(edge); }
+        ConstEdgeIterator edges(const Edge * e) const { return ConstEdgeIterator(e); }
+
+
+        // Iterator that visits the edges around this vertex in the reverse order of EdgeIterator (steps through prev->pair).
+        class ReverseEdgeIterator //: public Iterator<Edge *>
+        {
+        public:
+            ReverseEdgeIterator(Edge * e) : m_end(NULL), m_current(e) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->prev->pair;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual Edge * current() const { return m_current; }
+            Vertex * vertex() const { return m_current->vertex; }
+
+        private:
+            Edge * m_end;
+            Edge * m_current;
+        };
+
+        // Const version of ReverseEdgeIterator: steps through prev->pair, the reverse of ConstEdgeIterator.
+        class ReverseConstEdgeIterator //: public Iterator<Edge *>
+        {
+        public:
+            ReverseConstEdgeIterator(const Edge * e) : m_end(NULL), m_current(e) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->prev->pair;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual const Edge * current() const { return m_current; }
+            const Vertex * vertex() const { return m_current->to(); }
+
+        private:
+            const Edge * m_end;
+            const Edge * m_current;
+        };
+
+
+
+        // Iterator that visits all the colocal vertices, starting with the given vertex itself.
+        class VertexIterator //: public Iterator<Edge *>
+        {
+        public:
+            VertexIterator(Vertex * v) : m_end(NULL), m_current(v) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;   // The first call records the starting vertex as the stop marker.
+                m_current = m_current->next;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual Vertex * current() const { return m_current; }
+
+        private:
+            Vertex * m_end;
+            Vertex * m_current;
+        };
+
+        VertexIterator colocals() { return VertexIterator(this); }
+
+        // Iterator that visits all the colocal vertices, starting with the given vertex itself.
+        class ConstVertexIterator //: public Iterator<Edge *>
+        {
+        public:
+            ConstVertexIterator(const Vertex * v) : m_end(NULL), m_current(v) { }
+
+            virtual void advance()
+            {
+                if (m_end == NULL) m_end = m_current;
+                m_current = m_current->next;
+            }
+
+            virtual bool isDone() const { return m_end == m_current; }
+            virtual const Vertex * current() const { return m_current; }
+
+        private:
+            const Vertex * m_end;
+            const Vertex * m_current;
+        };
+
+        ConstVertexIterator colocals() const { return ConstVertexIterator(this); }
+
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_HALFEDGE_VERTEX_H
diff --git a/thirdparty/thekla_atlas/nvmesh/nvmesh.cpp b/thirdparty/thekla_atlas/nvmesh/nvmesh.cpp
new file mode 100644
index 0000000000..d007eda332
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/nvmesh.cpp
@@ -0,0 +1,2 @@
+#include "nvmesh.h" // pch
+
diff --git a/thirdparty/thekla_atlas/nvmesh/nvmesh.h b/thirdparty/thekla_atlas/nvmesh/nvmesh.h
new file mode 100644
index 0000000000..eb6819675d
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/nvmesh.h
@@ -0,0 +1,34 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_H
+#define NV_MESH_H
+
+#include "nvcore/nvcore.h"
+
+// Function linkage: NVMESH_API / NVMESH_CLASS expand to DLL export/import decorations only when NVMESH_SHARED is set.
+#if NVMESH_SHARED
+#ifdef NVMESH_EXPORTS
+#define NVMESH_API DLL_EXPORT
+#define NVMESH_CLASS DLL_EXPORT_CLASS
+#else
+#define NVMESH_API DLL_IMPORT
+#define NVMESH_CLASS DLL_IMPORT
+#endif
+#else
+#define NVMESH_API
+#define NVMESH_CLASS
+#endif
+
+#if 1 //USE_PRECOMPILED_HEADERS // If using precompiled headers:
+//#include <string.h> // strlen, strcmp, etc.
+//#include "nvcore/StrLib.h"
+//#include "nvcore/StdStream.h"
+//#include "nvcore/Memory.h"
+//#include "nvcore/Debug.h"
+//#include "nvmath/Vector.h"
+//#include "nvcore/Array.h"
+//#include "nvcore/HashMap.h"
+#endif
+
+#endif // NV_MESH_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/Atlas.cpp b/thirdparty/thekla_atlas/nvmesh/param/Atlas.cpp
new file mode 100644
index 0000000000..98f92cef96
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/Atlas.cpp
@@ -0,0 +1,1519 @@
+// Copyright NVIDIA Corporation 2006 -- Ignacio Castano <icastano@nvidia.com>
+
+#include "nvmesh.h" // pch
+
+#include "Atlas.h"
+#include "Util.h"
+#include "AtlasBuilder.h"
+#include "AtlasPacker.h"
+#include "SingleFaceMap.h"
+#include "OrthogonalProjectionMap.h"
+#include "LeastSquaresConformalMap.h"
+#include "ParameterizationQuality.h"
+
+//#include "nvmesh/export/MeshExportOBJ.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/halfedge/Vertex.h"
+
+#include "nvmesh/MeshBuilder.h"
+#include "nvmesh/MeshTopology.h"
+#include "nvmesh/param/Util.h"
+#include "nvmesh/geometry/Measurements.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+#include "nvmath/Box.inl"
+#include "nvmath/ProximityGrid.h"
+#include "nvmath/Morton.h"
+
+#include "nvcore/StrLib.h"
+#include "nvcore/Array.inl"
+#include "nvcore/HashMap.inl"
+
+using namespace nv;
+
+
+/// Ctor. Clears the failure flag.
+Atlas::Atlas()
+{
+    failed=false;
+}
+
+// Dtor. Hands the stored MeshCharts pointers to deleteAll().
+Atlas::~Atlas()
+{
+    deleteAll(m_meshChartsArray);
+}
+
+uint Atlas::chartCount() const
+{
+    // Sum the chart counts of every mesh registered with the atlas.
+    uint total = 0;
+    const uint meshCount = m_meshChartsArray.count();
+    for (uint m = 0; m < meshCount; m++) total += m_meshChartsArray[m]->chartCount();
+    return total;
+}
+
+const Chart * Atlas::chartAt(uint i) const
+{
+    // 'i' indexes all charts of the atlas; convert it to an index local to one mesh.
+    const uint meshCount = m_meshChartsArray.count();
+    for (uint m = 0; m < meshCount; m++) {
+        const uint localCount = m_meshChartsArray[m]->chartCount();
+
+        // Chart belongs to a later mesh: discount this mesh's charts and move on.
+        if (i >= localCount) { i -= localCount; continue; }
+        return m_meshChartsArray[m]->chartAt(i);
+    }
+
+    return NULL; // 'i' is past the last chart.
+}
+
+Chart * Atlas::chartAt(uint i)
+{
+    // 'i' indexes all charts of the atlas; convert it to an index local to one mesh.
+    const uint meshCount = m_meshChartsArray.count();
+    for (uint m = 0; m < meshCount; m++) {
+        const uint localCount = m_meshChartsArray[m]->chartCount();
+
+        // Chart belongs to a later mesh: discount this mesh's charts and move on.
+        if (i >= localCount) { i -= localCount; continue; }
+        return m_meshChartsArray[m]->chartAt(i);
+    }
+
+    return NULL; // 'i' is past the last chart.
+}
+
+// Register an already built set of mesh charts with this atlas (the pointer itself is stored, nothing is extracted here).
+void Atlas::addMeshCharts(MeshCharts * meshCharts)
+{
+    m_meshChartsArray.append(meshCharts);
+}
+
+void Atlas::extractCharts(const HalfEdge::Mesh * mesh)
+{
+    MeshCharts * extracted = new MeshCharts(mesh); // Stored in the atlas below.
+    extracted->extractCharts();
+    m_meshChartsArray.append(extracted);
+}
+
+void Atlas::computeCharts(const HalfEdge::Mesh * mesh, const SegmentationSettings & settings, const Array<uint> & unchartedMaterialArray)
+{
+    failed = false; // Reset the failure flag before charting.
+    MeshCharts * computed = new MeshCharts(mesh); // Stored in the atlas below.
+    computed->computeCharts(settings, unchartedMaterialArray);
+    m_meshChartsArray.append(computed);
+}
+
+
+
+
+#if 0
+
+/// Compute a seamless texture atlas.
+bool Atlas::computeSeamlessTextureAtlas(bool groupFaces/*= true*/, bool scaleTiles/*= false*/, uint w/*= 1024*/, uint h/* = 1024*/)
+{
+    // Implement seamless texture atlas similar to what ZBrush does. See also:
+    // "Meshed Atlases for Real-Time Procedural Solid Texturing"
+    // http://graphics.cs.uiuc.edu/~jch/papers/rtpst.pdf
+
+    // Other methods that we should experiment with:
+    // 
+    // Seamless Texture Atlases:
+    // http://www.cs.jhu.edu/~bpurnomo/STA/index.html
+    // 
+    // Rectangular Multi-Chart Geometry Images:
+    // http://graphics.cs.uiuc.edu/~jch/papers/rmcgi.pdf
+    // 
+    // Discrete differential geometry also provides a way of constructing
+    // seamless quadrangulations, as shown in:
+    // http://www.geometry.caltech.edu/pubs/TACD06.pdf
+    // 
+
+#pragma message(NV_FILE_LINE "TODO: Implement seamless texture atlas.")
+
+    if (groupFaces)
+    {
+        // @@ TODO.
+    }
+    else
+    {
+        // @@ Create one atlas per face.
+    }
+
+    if (scaleTiles)
+    {
+        // @@ TODO
+    }
+
+    /*
+    if (!isQuadMesh(m_mesh)) {
+        // Only handle quads for now.
+        return false;
+    }
+
+    // Each face is a chart.
+    const uint faceCount = m_mesh->faceCount();
+    m_chartArray.resize(faceCount);
+
+    for(uint f = 0; f < faceCount; f++) {
+        m_chartArray[f].faceArray.clear();
+        m_chartArray[f].faceArray.append(f);
+    }
+
+    // Map each face to a separate square.
+
+    // Determine face layout according to width and height.
+    float aspect = float(m_width) / float(m_height);
+
+    uint i = 2;
+    uint total = (m_width / (i+1)) * (m_height / (i+1));
+    while(total > faceCount) {
+        i *= 2;
+        total = (m_width / (i+1)) * (m_height / (i+1));
+    }
+
+    uint tileSize = i / 2;
+
+    int x = 0;
+    int y = 0;
+
+    m_result = new HalfEdge::Mesh();
+
+    // Once you have that it's just matter of traversing the faces.
+    for(uint f = 0; f < faceCount; f++) {
+        // Compute texture coordinates.
+        Vector2 tex[4];
+        tex[0] = Vector2(float(x), float(y));
+        tex[1] = Vector2(float(x+tileSize), float(y));
+        tex[2] = Vector2(float(x+tileSize), float(y+tileSize));
+        tex[3] = Vector2(float(x), float(y+tileSize));
+
+        Array<uint> indexArray(4);
+
+        const HalfEdge::Face * face = m_mesh->faceAt(f);
+
+        int i = 0;
+        for(HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance(), i++) {
+            const HalfEdge::Edge * edge = it.current();
+            const HalfEdge::Vertex * vertex = edge->from();
+
+            HalfEdge::Vertex * newVertex = m_result->addVertex(vertex->id(), vertex->pos());
+
+            newVertex->setTex(Vector3(tex[i], 0));
+            newVertex->setNor(vertex->nor());
+
+            indexArray.append(m_result->vertexCount() + 1);
+        }
+
+        m_result->addFace(indexArray);
+
+        // Move to the next tile.
+        x += tileSize + 1;
+        if (x + tileSize > m_width) {
+            x = 0;
+            y += tileSize + 1;
+        }
+    }
+    */
+
+    return false;
+}
+
+#endif
+
+
+void Atlas::parameterizeCharts()
+{
+    // Forward the request to the charts of every mesh in the atlas.
+    const uint meshCount = m_meshChartsArray.count();
+    for (uint m = 0; m < meshCount; m++) m_meshChartsArray[m]->parameterizeCharts();
+}
+
+
+float Atlas::packCharts(int quality, float texelsPerUnit, bool blockAlign, bool conservative)
+{
+    AtlasPacker packer(this);
+    packer.packCharts(quality, texelsPerUnit, blockAlign, conservative);
+
+    // When the atlas is flagged as failed, report zero utilization instead of querying the packer.
+    return hasFailed() ? 0.0f : packer.computeAtlasUtilization();
+}
+
+
+
+
+/// Ctor. Only stores the mesh pointer; no charts are created here.
+MeshCharts::MeshCharts(const HalfEdge::Mesh * mesh) : m_mesh(mesh)
+{
+}
+
+// Dtor. Hands the stored Chart pointers to deleteAll().
+MeshCharts::~MeshCharts()
+{
+    deleteAll(m_chartArray);
+}
+
+// Flood-fills the mesh across edges whose two sides share the same end vertices and builds one chart per connected patch.
+void MeshCharts::extractCharts()
+{
+    const uint faceCount = m_mesh->faceCount();
+
+    uint first = 0; // Cursor into the queue; unsigned so it matches queue.count() and the index type.
+    Array<uint> queue(faceCount);
+
+    BitArray bitFlags(faceCount); // One bit per face, set as soon as the face has been queued.
+    bitFlags.clearAll();
+
+    for (uint f = 0; f < faceCount; f++)
+    {
+        if (bitFlags.bitAt(f) == false)
+        {
+            // Start new patch. Reset queue.
+            first = 0;
+            queue.clear();
+            queue.append(f);
+            bitFlags.setBitAt(f);
+
+            while (first != queue.count())
+            {
+                const HalfEdge::Face * face = m_mesh->faceAt(queue[first]);
+
+                // Visit face neighbors of queue[first]
+                for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+                {
+                    const HalfEdge::Edge * edge = it.current();
+                    nvDebugCheck(edge->pair != NULL);
+
+                    if (!edge->isBoundary() && /*!edge->isSeam()*/ 
+                        //!(edge->from()->tex() != edge->pair()->to()->tex() || edge->to()->tex() != edge->pair()->from()->tex()))
+                        !(edge->from() != edge->pair->to() || edge->to() != edge->pair->from())) // Preserve existing seams (not just texture seams).
+                    {
+                        const HalfEdge::Face * neighborFace = edge->pair->face;
+                        nvDebugCheck(neighborFace != NULL);
+
+                        if (bitFlags.bitAt(neighborFace->id) == false)
+                        {
+                            queue.append(neighborFace->id);
+                            bitFlags.setBitAt(neighborFace->id);
+                        }
+                    }
+                }
+
+                first++;
+            }
+
+            // The queue now holds every face of the patch; turn it into a chart.
+            Chart * chart = new Chart();
+            chart->build(m_mesh, queue);
+
+            m_chartArray.append(chart);
+        }
+    }
+}
+
+
+/*
+LSCM:
+- identify sharp features using local dihedral angles.
+- identify seed faces farthest from sharp features.
+- grow charts from these seeds.
+
+MCGIM:
+- phase 1: chart growth
+  - grow all charts simultaneously using dijkstra search on the dual graph of the mesh.
+  - graph edges are weighted based on planarity metric.
+  - metric uses distance to global chart normal.
+  - terminate when all faces have been assigned.
+- phase 2: seed computation:
+  - place new seed of the chart at the most interior face.
+  - most interior is evaluated using distance metric only.
+
+- the method repeats the two phases until the location of the seeds does not change.
+  - cycles are detected by recording all the previous seeds and chartification terminates.
+
+D-Charts:
+
+- Uniaxial conic metric:
+  - N_c = axis of the generalized cone that best fits the chart (the cone can be a cylinder or a plane).
+  - omega_c = angle between the face normals and the axis.
+  - Fitting error between chart C and triangle t: F(c,t) = (N_c*n_t - cos(omega_c))^2
+
+- Compactness metrics:
+  - Roundness:
+    - C(c,t) = pi * D(S_c,t)^2 / A_c
+    - S_c = chart seed.
+    - D(S_c,t) = length of the shortest path inside the chart between S_c and t.
+    - A_c = chart area.
+  - Straightness:
+    - P(c,t) = l_out(c,t) / l_in(c,t)
+    - l_out(c,t) = length of the edges not shared between C and t.
+    - l_in(c,t) = length of the edges shared between C and t.
+
+- Combined metric:
+  - Cost(c,t) = F(c,t)^alpha + C(c,t)^beta + P(c,t)^gamma
+  - alpha = 1, beta = 0.7, gamma = 0.5
+
+
+
+
+Our basic approach:
+- Just one iteration of k-means?
+- Avoid dijkstra by greedily growing charts until a threshold is met. Increase threshold and repeat until no faces left.
+- If distortion metric is too high, split chart, add two seeds.
+- If chart size is low, try removing chart.
+
+
+Postprocess:
+- If topology is not disk:
+  - Fill holes, if new faces fit proxy.
+  - Find best cut, otherwise.
+- After parameterization:
+  - If boundary self-intersects: 
+    - cut chart along the closest two diametral boundary vertices, repeat parametrization.
+    - what if the overlap is on an appendix? How do we find that out and cut appropriately?
+      - emphasize roundness metrics to prevent those cases.
+  - If interior self-overlaps: preserve boundary parameterization and use mean-value map.
+
+*/
+
+
+SegmentationSettings::SegmentationSettings()
+{
+    // Charts have no area or boundary limits right now.
+    maxChartArea = NV_FLOAT_MAX;
+    maxBoundaryLength = NV_FLOAT_MAX;
+    // Default weights of the individual segmentation metrics.
+    proxyFitMetricWeight = 1.0f;
+    roundnessMetricWeight = 0.1f;
+    straightnessMetricWeight = 0.25f;
+    normalSeamMetricWeight = 1.0f;
+    textureSeamMetricWeight = 0.1f;
+}
+
+
+// Segments the mesh into charts: faces using a material in unchartedMaterialArray go into one vertex-mapped chart, the rest are charted by an AtlasBuilder.
+void MeshCharts::computeCharts(const SegmentationSettings & settings, const Array<uint> & unchartedMaterialArray)
+{
+    Chart * vertexMap = NULL;
+    
+    if (unchartedMaterialArray.count() != 0) {
+        vertexMap = new Chart();
+        vertexMap->buildVertexMap(m_mesh, unchartedMaterialArray);
+
+        if (vertexMap->faceCount() == 0) {
+            delete vertexMap;
+            vertexMap = NULL;
+        }
+    }
+    
+
+    AtlasBuilder builder(m_mesh);
+
+    if (vertexMap != NULL) {
+        // Mark faces that do not need to be charted.
+        builder.markUnchartedFaces(vertexMap->faceArray());
+
+        m_chartArray.append(vertexMap);
+    }
+
+    if (builder.facesLeft != 0) {
+
+        // Tweak these values:
+        const float maxThreshold = 2;
+        const uint growFaceCount = 32;
+        const uint maxIterations = 4;
+        
+        builder.settings = settings;
+
+        //builder.settings.proxyFitMetricWeight *= 0.75; // relax proxy fit weight during initial seed placement.
+        //builder.settings.roundnessMetricWeight = 0;
+        //builder.settings.straightnessMetricWeight = 0;
+
+        // This seems a reasonable estimate.
+        uint maxSeedCount = max(6U, builder.facesLeft);
+
+        // Create initial charts greedily.
+        nvDebug("### Placing seeds\n");
+        builder.placeSeeds(maxThreshold, maxSeedCount);
+        nvDebug("###   Placed %d seeds (max = %d)\n", builder.chartCount(), maxSeedCount);
+
+        builder.updateProxies();
+
+        builder.mergeCharts();
+
+    #if 1
+        nvDebug("### Relocating seeds\n");
+        builder.relocateSeeds();
+
+        nvDebug("### Reset charts\n");
+        builder.resetCharts();
+
+        if (vertexMap != NULL) {
+            builder.markUnchartedFaces(vertexMap->faceArray());
+        }
+
+        builder.settings = settings;
+
+        nvDebug("### Growing charts\n");
+
+        // Restart process growing charts in parallel.
+        uint iteration = 0;
+        while (true)
+        {
+            if (!builder.growCharts(maxThreshold, growFaceCount))
+            {
+                nvDebug("### Can't grow anymore\n");
+
+                // If charts cannot grow more: fill holes, merge charts, relocate seeds and start new iteration.
+
+                nvDebug("### Filling holes\n");
+                builder.fillHoles(maxThreshold);
+                nvDebug("###   Using %d charts now\n", builder.chartCount());
+
+                builder.updateProxies();
+
+                nvDebug("### Merging charts\n");
+                builder.mergeCharts();
+                nvDebug("###   Using %d charts now\n", builder.chartCount());
+
+                nvDebug("### Reseeding\n");
+                if (!builder.relocateSeeds())
+                {
+                    nvDebug("### Cannot relocate seeds anymore\n");
+
+                    // Done!
+                    break;
+                }
+
+                if (iteration == maxIterations)
+                {
+                    nvDebug("### Reached iteration limit\n");
+                    break;
+                }
+                iteration++;
+
+                nvDebug("### Reset charts\n");
+                builder.resetCharts();
+
+                if (vertexMap != NULL) {
+                    builder.markUnchartedFaces(vertexMap->faceArray());
+                }
+
+                nvDebug("### Growing charts\n");
+            }
+        };
+    #endif
+
+        // Make sure no holes are left!
+        nvDebugCheck(builder.facesLeft == 0);
+
+        // Turn every face set produced by the builder into a Chart.
+        const uint chartCount = builder.chartArray.count();
+        for (uint i = 0; i < chartCount; i++)
+        {
+            Chart * chart = new Chart();
+            m_chartArray.append(chart);
+
+            chart->build(m_mesh, builder.chartFaces(i));
+        }
+    }
+
+
+    const uint chartCount = m_chartArray.count();
+
+    // Build face indices: for every mesh face, the chart that contains it and its position inside that chart.
+    m_faceChart.resize(m_mesh->faceCount());
+    m_faceIndex.resize(m_mesh->faceCount());
+
+    for (uint i = 0; i < chartCount; i++)
+    {
+        const Chart * chart = m_chartArray[i];
+
+        const uint faceCount = chart->faceCount();
+        for (uint f = 0; f < faceCount; f++)
+        {
+            uint idx = chart->faceAt(f);
+            m_faceChart[idx] = i;
+            m_faceIndex[idx] = f;
+        }
+    }
+
+    // Build an exclusive prefix sum of the chart vertex counts.
+    m_chartVertexCountPrefixSum.resize(chartCount);
+    
+    if (chartCount > 0)
+    {
+        m_chartVertexCountPrefixSum[0] = 0;
+        
+        for (uint i = 1; i < chartCount; i++)
+        {
+            const Chart * chart = m_chartArray[i-1];
+            m_chartVertexCountPrefixSum[i] = m_chartVertexCountPrefixSum[i-1] + chart->vertexCount();
+        }
+
+        // Total = offset of the last chart plus the last chart's own vertex count.
+        m_totalVertexCount = m_chartVertexCountPrefixSum[chartCount - 1] + m_chartArray[chartCount-1]->vertexCount();
+    }
+    else
+    {
+        m_totalVertexCount = 0;
+    }
+}
+
+// Parameterizes every chart that is not vertex mapped and logs the accumulated quality metrics.
+void MeshCharts::parameterizeCharts()
+{
+    ParameterizationQuality globalParameterizationQuality;
+
+    // Parameterize the charts.
+    uint diskCount = 0;
+    const uint chartCount = m_chartArray.count();
+    for (uint i = 0; i < chartCount; i++)
+    {
+        Chart * chart = m_chartArray[i];
+
+        bool isValid = false;
+
+        // Vertex-mapped charts are skipped entirely: nothing is computed or transferred for them.
+        if (chart->isVertexMapped()) {
+            continue;
+        }
+
+        if (chart->isDisk())
+        {
+            diskCount++;
+
+            ParameterizationQuality chartParameterizationQuality;
+
+            if (chart->faceCount() == 1) {
+                computeSingleFaceMap(chart->unifiedMesh());
+
+                chartParameterizationQuality = ParameterizationQuality(chart->unifiedMesh());
+            }
+            else {
+                computeOrthogonalProjectionMap(chart->unifiedMesh());
+                ParameterizationQuality orthogonalQuality(chart->unifiedMesh());
+
+                computeLeastSquaresConformalMap(chart->unifiedMesh());
+                ParameterizationQuality lscmQuality(chart->unifiedMesh());
+                
+                // If the orthogonal projection produces better results, just use that.
+                // @@ It may be dangerous to do this, because isValid() does not detect self-overlaps.
+                // @@ Another problem is that with very thin patches with nearly zero parametric area, the results of our metric are not accurate.
+                /*if (orthogonalQuality.isValid() && orthogonalQuality.rmsStretchMetric() < lscmQuality.rmsStretchMetric()) {
+                    computeOrthogonalProjectionMap(chart->unifiedMesh());
+                    chartParameterizationQuality = orthogonalQuality;
+                }
+                else*/ {
+                    chartParameterizationQuality = lscmQuality;
+                }
+
+                // If conformal map failed, 
+
+                // @@ Experiment with other parameterization methods.
+                //computeCircularBoundaryMap(chart->unifiedMesh());
+                //computeConformalMap(chart->unifiedMesh());
+                //computeNaturalConformalMap(chart->unifiedMesh());
+                //computeGuidanceGradientMap(chart->unifiedMesh());
+            }
+
+            //ParameterizationQuality chartParameterizationQuality(chart->unifiedMesh());
+
+            isValid = chartParameterizationQuality.isValid();
+
+            if (!isValid)
+            {
+                nvDebug("*** Invalid parameterization.\n");
+#if 0
+                // Dump mesh to inspect problem:
+                static int pieceCount = 0;
+            
+                StringBuilder fileName;
+                fileName.format("invalid_chart_%d.obj", pieceCount++);
+                exportMesh(chart->unifiedMesh(), fileName.str()); 
+#endif
+            }
+
+            // @@ Check that parameterization quality is above a certain threshold.
+
+            // @@ Detect boundary self-intersections.
+
+            globalParameterizationQuality += chartParameterizationQuality;
+        }
+
+        // Non-disk charts also end up here with isValid == false; nothing is done about it yet.
+        if (!isValid)
+        {
+            //nvDebugBreak();
+            // @@ Run the builder again, but only on this chart.
+            //AtlasBuilder builder(chart->chartMesh());
+        }
+
+        // Transfer parameterization from unified mesh to chart mesh.
+        chart->transferParameterization();
+
+    }
+
+    // The first figure counts disk charts only; the label of the last line names the metric that is actually printed.
+    nvDebug("  Parameterized %d/%d charts.\n", diskCount, chartCount);
+    nvDebug("  RMS stretch metric: %f\n", globalParameterizationQuality.rmsStretchMetric());
+    nvDebug("  MAX stretch metric: %f\n", globalParameterizationQuality.maxStretchMetric());
+    nvDebug("  RMS conformal metric: %f\n", globalParameterizationQuality.rmsConformalMetric());
+    nvDebug("  MAX authalic metric: %f\n", globalParameterizationQuality.maxAuthalicMetric());
+}
+
+
+// Ctor. Starts with no meshes; the chart is neither flagged as a disk nor as vertex mapped.
+Chart::Chart() : m_chartMesh(NULL), m_unifiedMesh(NULL), m_isDisk(false), m_isVertexMapped(false)
+{
+}
+// Builds the chart from the given faces of originalMesh: a chart mesh with one vertex per original vertex, and a unified mesh keyed by firstColocal().
+void Chart::build(const HalfEdge::Mesh * originalMesh, const Array<uint> & faceArray)
+{
+    // Copy face indices.
+    m_faceArray = faceArray;
+
+    const uint meshVertexCount = originalMesh->vertexCount();
+
+    m_chartMesh = new HalfEdge::Mesh();
+    m_unifiedMesh = new HalfEdge::Mesh();
+
+    Array<uint> chartMeshIndices; // Original vertex id -> chart mesh vertex index; ~0 means "not added yet".
+    chartMeshIndices.resize(meshVertexCount, ~0);
+
+    Array<uint> unifiedMeshIndices; // Original vertex id -> unified mesh vertex index; ~0 means "not added yet".
+    unifiedMeshIndices.resize(meshVertexCount, ~0);
+
+    // Add vertices.
+    const uint faceCount = faceArray.count();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = originalMesh->faceAt(faceArray[f]);
+        nvDebugCheck(face != NULL);
+
+        for(HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Vertex * vertex = it.current()->vertex;
+            const HalfEdge::Vertex * unifiedVertex = vertex->firstColocal();
+
+            if (unifiedMeshIndices[unifiedVertex->id] == ~0)
+            {
+                unifiedMeshIndices[unifiedVertex->id] = m_unifiedMesh->vertexCount();
+
+                nvDebugCheck(vertex->pos == unifiedVertex->pos);
+                m_unifiedMesh->addVertex(vertex->pos);
+            }
+
+            if (chartMeshIndices[vertex->id] == ~0)
+            {
+                chartMeshIndices[vertex->id] = m_chartMesh->vertexCount();
+                m_chartToOriginalMap.append(vertex->id);
+                m_chartToUnifiedMap.append(unifiedMeshIndices[unifiedVertex->id]);
+
+                HalfEdge::Vertex * v = m_chartMesh->addVertex(vertex->pos);
+                v->nor = vertex->nor;
+                v->tex = vertex->tex;
+            }
+        }
+    }
+
+    // This is ignoring the canonical map:
+    // - Is it really necessary to link colocals?
+
+    m_chartMesh->linkColocals();    
+    //m_unifiedMesh->linkColocals();  // Not strictly necessary, no colocals in the unified mesh. # Wrong.
+
+    // This check is not valid anymore, if the original mesh vertices were linked with a canonical map, then it might have
+    // some colocal vertices that were unlinked. So, the unified mesh might have some duplicate vertices, because firstColocal()
+    // is not guaranteed to return the same vertex for two colocal vertices.
+    //nvCheck(m_chartMesh->colocalVertexCount() == m_unifiedMesh->vertexCount());
+
+    // Is that OK? What happens in meshes where that happens? Does anything break? Apparently not...
+    
+
+    // Scratch list of vertex indices, cleared and refilled for every face.
+    Array<uint> faceIndices(7);
+
+    // Add faces.
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = originalMesh->faceAt(faceArray[f]);
+        nvDebugCheck(face != NULL);
+
+        faceIndices.clear();
+
+        for(HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Vertex * vertex = it.current()->vertex;
+            nvDebugCheck(vertex != NULL);
+
+            faceIndices.append(chartMeshIndices[vertex->id]);
+        }
+
+        m_chartMesh->addFace(faceIndices);
+
+        faceIndices.clear();
+
+        for(HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Vertex * vertex = it.current()->vertex;
+            nvDebugCheck(vertex != NULL);
+
+            vertex = vertex->firstColocal();
+
+            faceIndices.append(unifiedMeshIndices[vertex->id]);
+        }
+
+        m_unifiedMesh->addFace(faceIndices);
+    }
+
+    m_chartMesh->linkBoundary();
+    m_unifiedMesh->linkBoundary();
+
+    //exportMesh(m_unifiedMesh.ptr(), "debug_input.obj");
+
+    if (m_unifiedMesh->splitBoundaryEdges()) {
+        m_unifiedMesh = unifyVertices(m_unifiedMesh.ptr());
+    }
+
+    //exportMesh(m_unifiedMesh.ptr(), "debug_split.obj");
+
+    // Closing the holes is not always the best solution and does not fix all the problems.
+    // We need to do some analysis of the holes and the genus to:
+    // - Find cuts that reduce genus.
+    // - Find cuts to connect holes.
+    // - Use minimal spanning trees or seamster.
+    if (!closeHoles()) {
+        /*static int pieceCount = 0;
+        StringBuilder fileName;
+        fileName.format("debug_hole_%d.obj", pieceCount++);
+        exportMesh(m_unifiedMesh.ptr(), fileName.str());*/
+    }
+
+    m_unifiedMesh = triangulate(m_unifiedMesh.ptr());
+    
+    //exportMesh(m_unifiedMesh.ptr(), "debug_triangulated.obj");
+
+
+    // Analyze chart topology.
+    MeshTopology topology(m_unifiedMesh.ptr());
+    m_isDisk = topology.isDisk();
+
+    // This is sometimes failing, when triangulate fails to add a triangle, it generates a hole in the mesh.
+    //nvDebugCheck(m_isDisk);
+
+    /*if (!m_isDisk) {
+        static int pieceCount = 0;
+        StringBuilder fileName;
+        fileName.format("debug_hole_%d.obj", pieceCount++);
+        exportMesh(m_unifiedMesh.ptr(), fileName.str());
+    }*/
+
+
+#if 0
+    if (!m_isDisk) {
+        nvDebugBreak();
+
+        static int pieceCount = 0;
+        
+        StringBuilder fileName;
+        fileName.format("debug_nodisk_%d.obj", pieceCount++);
+        exportMesh(m_chartMesh.ptr(), fileName.str()); 
+    }
+#endif
+
+}
+
+
+void Chart::buildVertexMap(const HalfEdge::Mesh * originalMesh, const Array<uint> & unchartedMaterialArray)
+{
+    nvCheck(m_chartMesh == NULL && m_unifiedMesh == NULL);
+
+    m_isVertexMapped = true;
+
+    // Build face indices.
+    m_faceArray.clear();
+
+    const uint meshFaceCount = originalMesh->faceCount();
+    for (uint f = 0; f < meshFaceCount; f++) {
+        const HalfEdge::Face * face = originalMesh->faceAt(f);
+
+        if (unchartedMaterialArray.contains(face->material)) {
+            m_faceArray.append(f);
+        }
+    }
+
+    const uint faceCount = m_faceArray.count();
+
+    if (faceCount == 0) {
+        return;
+    }
+
+
+    // @@ The chartMesh construction is basically the same as with regular charts, don't duplicate!
+
+    const uint meshVertexCount = originalMesh->vertexCount();
+
+    m_chartMesh = new HalfEdge::Mesh();
+
+    Array<uint> chartMeshIndices;
+    chartMeshIndices.resize(meshVertexCount, ~0);
+
+    // Vertex map mesh only has disconnected vertices.
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = originalMesh->faceAt(m_faceArray[f]);
+        nvDebugCheck(face != NULL);
+
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Vertex * vertex = it.current()->vertex;
+
+            if (chartMeshIndices[vertex->id] == ~0)
+            {
+                chartMeshIndices[vertex->id] = m_chartMesh->vertexCount();
+                m_chartToOriginalMap.append(vertex->id);
+
+                HalfEdge::Vertex * v = m_chartMesh->addVertex(vertex->pos);
+                v->nor = vertex->nor;
+                v->tex = vertex->tex; // @@ Not necessary.
+            }
+        }
+    }
+
+    // @@ Link colocals using the original mesh canonical map? Build canonical map on the fly? Do we need to link colocals at all for this?
+    //m_chartMesh->linkColocals();
+
+    Array<uint> faceIndices(7);
+
+    // Add faces.
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = originalMesh->faceAt(m_faceArray[f]);
+        nvDebugCheck(face != NULL);
+
+        faceIndices.clear();
+
+        for(HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Vertex * vertex = it.current()->vertex;
+            nvDebugCheck(vertex != NULL);
+            nvDebugCheck(chartMeshIndices[vertex->id] != ~0);
+
+            faceIndices.append(chartMeshIndices[vertex->id]);
+        }
+
+        HalfEdge::Face * new_face = m_chartMesh->addFace(faceIndices);
+        nvDebugCheck(new_face != NULL);
+    }
+
+    m_chartMesh->linkBoundary();
+
+
+    const uint chartVertexCount = m_chartMesh->vertexCount();
+
+    Box bounds;
+    bounds.clearBounds();
+
+    for (uint i = 0; i < chartVertexCount; i++) {
+        HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(i);
+        bounds.addPointToBounds(vertex->pos);
+    }
+
+    ProximityGrid grid;
+    grid.init(bounds, chartVertexCount);
+
+    for (uint i = 0; i < chartVertexCount; i++) {
+        HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(i);
+        grid.add(vertex->pos, i);
+    }
+
+
+#if 0
+    // Arrange vertices in a rectangle.
+    vertexMapWidth = ftoi_ceil(sqrtf(float(chartVertexCount)));
+    vertexMapHeight = (chartVertexCount + vertexMapWidth - 1) / vertexMapWidth;
+    nvDebugCheck(vertexMapWidth >= vertexMapHeight);
+
+    int x = 0, y = 0;
+    for (uint i = 0; i < chartVertexCount; i++) {
+        HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(i);
+
+        vertex->tex.x = float(x);
+        vertex->tex.y = float(y);
+
+        x++;
+        if (x == vertexMapWidth) {
+            x = 0;
+            y++;
+            nvCheck(y < vertexMapHeight);
+        }
+    }
+
+#elif 0
+    // Arrange vertices in a rectangle, traversing grid in 3D morton order and laying them down in 2D morton order.
+    vertexMapWidth = ftoi_ceil(sqrtf(float(chartVertexCount)));
+    vertexMapHeight = (chartVertexCount + vertexMapWidth - 1) / vertexMapWidth;
+    nvDebugCheck(vertexMapWidth >= vertexMapHeight);
+
+    int n = 0;
+    uint32 texelCode = 0;
+
+    uint cellsVisited = 0;
+
+    const uint32 cellCodeCount = grid.mortonCount();
+    for (uint32 cellCode = 0; cellCode < cellCodeCount; cellCode++) {
+        int cell = grid.mortonIndex(cellCode);
+        if (cell < 0) continue;
+
+        cellsVisited++;
+
+        const Array<uint> & indexArray = grid.cellArray[cell].indexArray;
+
+        foreach(i, indexArray) {
+            uint idx = indexArray[i];
+            HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(idx);
+
+            //vertex->tex.x = float(n % rectangleWidth) + 0.5f;
+            //vertex->tex.y = float(n / rectangleWidth) + 0.5f;
+
+            // Lay down the points in z order too.
+            uint x, y;
+            do {
+                x = decodeMorton2X(texelCode);
+                y = decodeMorton2Y(texelCode);
+                texelCode++;
+            } while (x >= U32(vertexMapWidth) || y >= U32(vertexMapHeight));
+            
+            vertex->tex.x = float(x);
+            vertex->tex.y = float(y);
+
+            n++;
+        }
+    }
+
+    nvDebugCheck(cellsVisited == grid.cellArray.count());
+    nvDebugCheck(n == chartVertexCount);
+
+#else
+
+    uint texelCount = 0;
+
+    const float positionThreshold = 0.01f;
+    const float normalThreshold = 0.01f;
+
+    uint verticesVisited = 0;
+    uint cellsVisited = 0;
+
+    Array<int> vertexIndexArray;
+    vertexIndexArray.resize(chartVertexCount, -1); // Init all indices to -1.
+
+    // Traverse vertices in morton order. @@ It may be more interesting to sort them based on orientation.
+    const uint cellCodeCount = grid.mortonCount();
+    for (uint cellCode = 0; cellCode < cellCodeCount; cellCode++) {
+        int cell = grid.mortonIndex(cellCode);
+        if (cell < 0) continue;
+
+        cellsVisited++;
+
+        const Array<uint> & indexArray = grid.cellArray[cell].indexArray;
+
+        foreach(i, indexArray) {
+            uint idx = indexArray[i];
+            HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(idx);
+
+            nvDebugCheck(vertexIndexArray[idx] == -1);
+
+            Array<uint> neighbors;
+            grid.gather(vertex->pos, positionThreshold, /*ref*/neighbors);
+
+            // Compare against all nearby vertices, cluster greedily.
+            foreach(j, neighbors) {
+                uint otherIdx = neighbors[j];
+
+                if (vertexIndexArray[otherIdx] != -1) {
+                    HalfEdge::Vertex * otherVertex = m_chartMesh->vertexAt(otherIdx);
+
+                    if (distance(vertex->pos, otherVertex->pos) < positionThreshold &&
+                        distance(vertex->nor, otherVertex->nor) < normalThreshold) 
+                    {
+                        vertexIndexArray[idx] = vertexIndexArray[otherIdx];
+                        break;
+                    }
+                }
+            }
+
+            // If index not assigned, assign new one.
+            if (vertexIndexArray[idx] == -1) {
+                vertexIndexArray[idx] = texelCount++;
+            }
+
+            verticesVisited++;
+        }
+    }
+
+    nvDebugCheck(cellsVisited == grid.cellArray.count());
+    nvDebugCheck(verticesVisited == chartVertexCount);
+
+    vertexMapWidth = ftoi_ceil(sqrtf(float(texelCount)));
+    vertexMapWidth = (vertexMapWidth + 3) & ~3;                             // Width aligned to 4.
+    vertexMapHeight = vertexMapWidth == 0 ? 0 : (texelCount + vertexMapWidth - 1) / vertexMapWidth;
+    //vertexMapHeight = (vertexMapHeight + 3) & ~3;                           // Height aligned to 4.
+    nvDebugCheck(vertexMapWidth >= vertexMapHeight);
+
+    nvDebug("Reduced vertex count from %d to %d.\n", chartVertexCount, texelCount);
+
+#if 0
+    // This lays down the clustered vertices linearly.
+    for (uint i = 0; i < chartVertexCount; i++) {
+        HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(i);
+
+        int idx = vertexIndexArray[i];
+
+        vertex->tex.x = float(idx % vertexMapWidth);
+        vertex->tex.y = float(idx / vertexMapWidth);
+    }
+#else
+    // Lay down the clustered vertices in morton order.
+
+    Array<uint> texelCodes;
+    texelCodes.resize(texelCount);
+
+    // For each texel, assign one morton code.
+    uint texelCode = 0;
+    for (uint i = 0; i < texelCount; i++) {
+        uint x, y;
+        do {
+            x = decodeMorton2X(texelCode);
+            y = decodeMorton2Y(texelCode);
+            texelCode++;
+        } while (x >= U32(vertexMapWidth) || y >= U32(vertexMapHeight));
+
+        texelCodes[i] = texelCode - 1;
+    }
+
+    for (uint i = 0; i < chartVertexCount; i++) {
+        HalfEdge::Vertex * vertex = m_chartMesh->vertexAt(i);
+
+        int idx = vertexIndexArray[i];
+        if (idx != -1) {
+            uint texelCode = texelCodes[idx];
+            uint x = decodeMorton2X(texelCode);
+            uint y = decodeMorton2Y(texelCode);
+
+            vertex->tex.x = float(x);
+            vertex->tex.y = float(y);
+        }
+    }
+
+#endif
+   
+#endif
+
+}
+
+
+
+// Collects one representative edge for every boundary loop of `mesh`.
+// `boundaryEdges` is cleared first; each appended edge is the face-less
+// half (the `pair` of a boundary edge) from which the loop can be walked via `next`.
+static void getBoundaryEdges(HalfEdge::Mesh * mesh, Array<HalfEdge::Edge *> & boundaryEdges)
+{
+    nvDebugCheck(mesh != NULL);
+
+    boundaryEdges.clear();
+
+    const uint edgeCount = mesh->edgeCount();
+
+    // One flag per edge slot, set once the edge has been assigned to a boundary loop.
+    BitArray visited(edgeCount);
+    visited.clearAll();
+
+    for (uint e = 0; e < edgeCount; e++)
+    {
+        HalfEdge::Edge * candidate = mesh->edgeAt(e);
+
+        // Skip empty slots, interior edges and edges already claimed by an earlier loop.
+        if (candidate == NULL) continue;
+        if (!candidate->isBoundary()) continue;
+        if (visited.bitAt(e)) continue;
+
+        nvDebugCheck(candidate->face != NULL);
+        nvDebugCheck(candidate->pair->face == NULL);
+
+        // Walk the loop on the face-less side of the boundary.
+        HalfEdge::Edge * loopStart = candidate->pair;
+
+        const HalfEdge::Edge * current = loopStart;
+        do {
+            nvDebugCheck(current->face == NULL);
+            nvDebugCheck(visited.bitAt(current->id/2) == false);
+
+            // Both halves of an edge share the slot id / 2.
+            visited.setBitAt(current->id / 2);
+            current = current->next;
+        } while(current != loopStart);
+
+        boundaryEdges.append(loopStart);
+    }
+}
+
+
+// Fills the hole bounded by the edges loop[start .. loop.count()-1] of the unified mesh.
+// Returns false (and does nothing) when the loop has fewer than three edges, true otherwise.
+bool Chart::closeLoop(uint start, const Array<HalfEdge::Edge *> & loop)
+{
+    const uint vertexCount = loop.count() - start;
+
+    nvDebugCheck(vertexCount >= 3);
+    if (vertexCount < 3) return false;
+
+    nvDebugCheck(loop[start]->vertex->isColocal(loop[start+vertexCount-1]->to()));
+
+    // Strategy (still a bit of a hack; better hole filling algorithms exist):
+    // - planar hole: add a single face, to be properly triangulated later.
+    // - non-planar hole: add a triangle fan around a new vertex at the hole centroid.
+
+    // Gather the positions of the loop vertices.
+    Array<Vector3> points;
+    points.resize(vertexCount);
+    for (uint i = 0; i < vertexCount; i++) {
+        points[i] = loop[start+i]->vertex->pos;
+    }
+
+    if (Fit::isPlanar(vertexCount, points.buffer())) {
+        // Planar: one new face owns every loop edge, and the edges are chained into a cycle.
+        HalfEdge::Face * face = m_unifiedMesh->addFace();
+        for (uint i = 0; i < vertexCount; i++) {
+            const uint successor = (i + 1) % vertexCount;
+            HalfEdge::Edge * edge = loop[start + i];
+
+            edge->face = face;
+            edge->setNext(loop[start + successor]);
+        }
+        face->edge = loop[start];
+
+        nvDebugCheck(face->isValid());
+        return true;
+    }
+
+    // Non-planar: cross our fingers and fan out from the boundary centroid.
+    Vector3 centroidPos(0);
+    for (uint i = 0; i < vertexCount; i++) {
+        centroidPos += points[i];
+    }
+    centroidPos *= (1.0f / vertexCount);
+
+    HalfEdge::Vertex * centroid = m_unifiedMesh->addVertex(centroidPos);
+
+    // One triangle (centroid, previous vertex, current vertex) per boundary vertex.
+    for (uint i = 0; i < vertexCount; i++) {
+        const uint j = (i == 0) ? vertexCount - 1 : i - 1;
+        HalfEdge::Face * face = m_unifiedMesh->addFace(centroid->id, loop[start+j]->vertex->id, loop[start+i]->vertex->id);
+        nvDebugCheck(face != NULL);
+    }
+
+    return true;
+}
+
+
+// Closes every boundary loop of the unified mesh except the longest one.
+//
+// Steps:
+//  1. Collect one edge per boundary loop; with zero or one loop there is nothing to do.
+//  2. Measure the perimeter of each loop and treat the longest as the disk (outer) boundary.
+//  3. For every other loop, walk its edges; whenever the walk reaches a vertex colocal with
+//     one already visited, the sub-loop in between is closed with closeLoop() and the
+//     remaining boundary is relinked around it. The loop that is left is closed last.
+//  4. Re-collect the boundaries and return true only when exactly one remains.
+//
+// The return value of closeLoop() is ignored here.
+bool Chart::closeHoles()
+{
+    nvDebugCheck(!m_isVertexMapped);
+
+    Array<HalfEdge::Edge *> boundaryEdges;
+    getBoundaryEdges(m_unifiedMesh.ptr(), boundaryEdges);
+
+    uint boundaryCount = boundaryEdges.count();
+    if (boundaryCount <= 1)
+    {
+        // Nothing to close.
+        return true;
+    }
+
+    // Compute the perimeter length of every boundary loop.
+    Array<float> boundaryLengths;
+    //Array<Vector3> boundaryCentroids;
+
+    for (uint i = 0; i < boundaryCount; i++)
+    {
+        const HalfEdge::Edge * startEdge = boundaryEdges[i];
+        nvCheck(startEdge->face == NULL);
+
+        //float boundaryEdgeCount = 0;
+        float boundaryLength = 0.0f;
+        //Vector3 boundaryCentroid(zero);
+
+        const HalfEdge::Edge * edge = startEdge;
+        do {
+            Vector3 t0 = edge->from()->pos;
+            Vector3 t1 = edge->to()->pos;
+
+            //boundaryEdgeCount++;
+            boundaryLength += length(t1 - t0);
+            //boundaryCentroid += edge->vertex()->pos;
+
+            edge = edge->next;
+        } while(edge != startEdge);
+
+        boundaryLengths.append(boundaryLength);
+        //boundaryCentroids.append(boundaryCentroid / boundaryEdgeCount);
+    }
+
+
+    // Find disk boundary: the loop with the largest perimeter (first one wins on ties).
+    uint diskBoundary = 0;
+    float maxLength = boundaryLengths[0];
+
+    for (uint i = 1; i < boundaryCount; i++)
+    {
+        if (boundaryLengths[i] > maxLength)
+        {
+            maxLength = boundaryLengths[i];
+            diskBoundary = i;
+        }
+    }
+
+
+    // Sew holes.
+    /*for (uint i = 0; i < boundaryCount; i++)
+    {
+        if (diskBoundary == i)
+        {
+            // Skip disk boundary.
+            continue;
+        }
+
+        HalfEdge::Edge * startEdge = boundaryEdges[i];
+        nvCheck(startEdge->face() == NULL);
+
+        boundaryEdges[i] = m_unifiedMesh->sewBoundary(startEdge);
+    }
+
+    exportMesh(m_unifiedMesh.ptr(), "debug_sewn.obj");*/
+
+    //bool hasNewHoles = false;
+
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    // @@ Close loop is wrong, after closing a loop, we do not only have to add the face, but make sure that every edge in the loop is pointing to the right place.
+
+    // Close holes.
+    for (uint i = 0; i < boundaryCount; i++)
+    {
+        if (diskBoundary == i)
+        {
+            // Skip disk boundary.
+            continue;
+        }
+
+        HalfEdge::Edge * startEdge = boundaryEdges[i];
+        nvDebugCheck(startEdge != NULL);
+        nvDebugCheck(startEdge->face == NULL);
+
+#if 1
+        // vertexLoop[k] is the destination vertex of edgeLoop[k]; both grow in lockstep.
+        Array<HalfEdge::Vertex *> vertexLoop;
+        Array<HalfEdge::Edge *> edgeLoop;
+
+        HalfEdge::Edge * edge = startEdge;
+        do {
+            HalfEdge::Vertex * vertex = edge->next->vertex; // edge->to()
+
+            // NOTE: this `i` shadows the boundary index of the enclosing for loop.
+            // Look for an already visited vertex that is colocal with the current one.
+            uint i;
+            for (i = 0; i < vertexLoop.count(); i++) {
+                if (vertex->isColocal(vertexLoop[i])) {
+                    break;
+                }
+            }
+            
+            // The boundary touches itself here when a colocal vertex was found.
+            bool isCrossing = (i != vertexLoop.count());
+
+            if (isCrossing) {
+
+                HalfEdge::Edge * prev = edgeLoop[i];    // Previous edge before the loop.
+                HalfEdge::Edge * next = edge->next;   // Next edge after the loop.
+
+                nvDebugCheck(prev->to()->isColocal(next->from()));
+
+                // Close loop: the edges recorded after index i, plus the current edge.
+                edgeLoop.append(edge);
+                closeLoop(i+1, edgeLoop);
+
+                // Link boundary loop so that it bypasses the sub-loop that was just closed.
+                prev->setNext(next);
+                vertex->setEdge(next);
+
+                // Start over again.
+                vertexLoop.clear();
+                edgeLoop.clear();
+                
+                edge = startEdge;
+                vertex = edge->to();
+            }
+
+            vertexLoop.append(vertex);
+            edgeLoop.append(edge);
+
+            edge = edge->next;
+        } while(edge != startEdge);
+
+        // Close whatever remains of this boundary.
+        closeLoop(0, edgeLoop);
+#endif
+
+        /*
+
+        // Add face and connect boundary edges.
+        HalfEdge::Face * face = m_unifiedMesh->addFace();
+        face->setEdge(startEdge);
+
+        HalfEdge::Edge * edge = startEdge;
+        do {
+            edge->setFace(face);
+
+            edge = edge->next();
+        } while(edge != startEdge);
+
+        */
+
+
+        /*
+        uint edgeCount = 0;
+        HalfEdge::Edge * edge = startEdge;
+        do {
+            edgeCount++;
+            edge = edge->next();
+        } while(edge != startEdge);
+
+
+
+        // Count edges in this boundary.
+        uint edgeCount = 0;
+        HalfEdge::Edge * edge = startEdge;
+        do {
+            edgeCount++;
+            edge = edge->next();
+        } while(edge != startEdge);
+
+        // Trivial hole, fill with one triangle. This actually works for all convex boundaries with non colinear vertices.
+        if (edgeCount == 3) {
+            // Add face and connect boundary edges.
+            HalfEdge::Face * face = m_unifiedMesh->addFace();
+            face->setEdge(startEdge);
+
+            edge = startEdge;
+            do {
+                edge->setFace(face);
+
+                edge = edge->next();
+            } while(edge != startEdge);
+
+            // @@ Implement the above using addFace, it should now work with existing edges, as long as their face pointers is zero.
+
+        }
+        else {
+            // Ideally we should:
+            // - compute best fit plane of boundary vertices.
+            // - project boundary polygon onto plane.
+            // - triangulate boundary polygon.
+            // - add faces of the resulting triangulation.
+
+            // I don't have a good triangulator available. A more simple solution that works in more (but not all) cases:
+            // - compute boundary centroid.
+            // - add vertex centroid.
+            // - connect centroid vertex with boundary vertices.
+            // - connect radial edges with boundary edges.
+
+            // This should work for non-convex boundaries with colinear vertices as long as the kernel of the polygon is not empty.
+
+            // Compute boundary centroid:
+            Vector3 centroid_pos(0);
+            Vector2 centroid_tex(0);
+
+            HalfEdge::Edge * edge = startEdge;
+            do {
+                centroid_pos += edge->vertex()->pos;
+                centroid_tex += edge->vertex()->tex;
+                edge = edge->next();
+            } while(edge != startEdge);
+
+            centroid_pos *= (1.0f / edgeCount);
+            centroid_tex *= (1.0f / edgeCount);
+
+            HalfEdge::Vertex * centroid = m_unifiedMesh->addVertex(centroid_pos);
+            centroid->tex = centroid_tex;
+
+            // Add one pair of edges for each boundary vertex.
+            edge = startEdge;
+            do {
+                HalfEdge::Edge * next = edge->next();
+
+                nvCheck(edge->face() == NULL);
+                HalfEdge::Face * face = m_unifiedMesh->addFace(centroid->id(), edge->from()->id(), edge->to()->id());
+                
+                if (face != NULL) {
+                    nvCheck(edge->face() == face);
+                }
+                else {
+                    hasNewHoles = true;
+                }
+
+                edge = next;
+            } while(edge != startEdge);
+        }
+        */
+    }
+
+    /*nvDebugCheck(!hasNewHoles);
+
+    if (hasNewHoles) {
+        // Link boundary again, in case closeHoles created new holes!
+        m_unifiedMesh->linkBoundary();
+    }*/
+
+    // Because some algorithms do not expect sparse edge buffers.
+    //m_unifiedMesh->compactEdges();
+
+    // In case we messed up:
+    //m_unifiedMesh->linkBoundary();
+
+    // Verify the result: only the disk boundary should be left.
+    getBoundaryEdges(m_unifiedMesh.ptr(), boundaryEdges);
+
+    boundaryCount = boundaryEdges.count();
+    nvDebugCheck(boundaryCount == 1);
+
+    //exportMesh(m_unifiedMesh.ptr(), "debug_hole_filled.obj");
+
+    return boundaryCount == 1;
+}
+
+
+// Copy the texture coordinates computed on the unified mesh back onto the
+// corresponding vertices of the chart mesh.
+void Chart::transferParameterization() {
+    nvDebugCheck(!m_isVertexMapped);
+
+    const uint chartVertexCount = m_chartMesh->vertexCount();
+    for (uint v = 0; v != chartVertexCount; ++v) {
+        // Each chart vertex maps to exactly one unified vertex.
+        const uint unifiedIndex = mapChartVertexToUnifiedVertex(v);
+
+        HalfEdge::Vertex * target = m_chartMesh->vertexAt(v);
+        HalfEdge::Vertex * source = m_unifiedMesh->vertexAt(unifiedIndex);
+        target->tex = source->tex;
+    }
+}
+
+// Surface area of the chart mesh, multiplied by this chart's `scale` factor.
+float Chart::computeSurfaceArea() const {
+    return scale * nv::computeSurfaceArea(m_chartMesh.ptr());
+}
+
+// Area of the chart mesh measured in parametric (texture) space.
+float Chart::computeParametricArea() const {
+    // Only meaningful for parameterized charts: disk topology, not vertex mapped.
+    nvDebugCheck(m_isDisk);
+    nvDebugCheck(!m_isVertexMapped);
+
+    return nv::computeParametricArea(chartMesh());
+}
+
+// Extents of the bounding box of the chart's texture coordinates.
+Vector2 Chart::computeParametricBounds() const {
+    // Only meaningful for parameterized charts: disk topology, not vertex mapped.
+    nvDebugCheck(m_isDisk);
+    nvDebugCheck(!m_isVertexMapped);
+
+    Box uvBounds;
+    uvBounds.clearBounds();
+
+    // Grow the box with every vertex UV, lifted to 3D with z = 0.
+    for (uint v = 0, count = m_chartMesh->vertexCount(); v < count; v++) {
+        const HalfEdge::Vertex * chartVertex = m_chartMesh->vertexAt(v);
+        uvBounds.addPointToBounds(Vector3(chartVertex->tex, 0));
+    }
+
+    return uvBounds.extents().xy();
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/Atlas.h b/thirdparty/thekla_atlas/nvmesh/param/Atlas.h
new file mode 100644
index 0000000000..41cfaea9cb
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/Atlas.h
@@ -0,0 +1,186 @@
+// Copyright NVIDIA Corporation 2006 -- Ignacio Castano <icastano@nvidia.com>
+
+#pragma once
+#ifndef NV_MESH_ATLAS_H
+#define NV_MESH_ATLAS_H
+
+#include "nvcore/Array.h"
+#include "nvcore/Ptr.h"
+#include "nvmath/Vector.h"
+#include "nvmesh/nvmesh.h"
+#include "nvmesh/halfedge/Mesh.h"
+
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+
+    class Chart;
+    class MeshCharts;
+    class VertexMap;
+
+    // Tuning parameters for chart segmentation. Passed by const reference to
+    // Atlas::computeCharts and MeshCharts::computeCharts.
+    // NOTE(review): the default values are set by the constructor, which is defined
+    // outside this header.
+    struct SegmentationSettings
+    {
+        SegmentationSettings();
+
+        // NOTE(review): presumably upper limits for the area and perimeter of a single chart — confirm against the segmentation code.
+        float maxChartArea;
+        float maxBoundaryLength;
+
+        // NOTE(review): presumably relative weights of the individual cost terms used when growing charts — confirm against the segmentation code.
+        float proxyFitMetricWeight;
+        float roundnessMetricWeight;
+        float straightnessMetricWeight;
+        float normalSeamMetricWeight;
+        float textureSeamMetricWeight;
+    };
+
+
+    /// An atlas is a set of charts.
+    /// Charts are grouped per input mesh: the atlas stores one MeshCharts object per mesh.
+    class Atlas
+    {
+    public:
+
+        Atlas();
+        ~Atlas();
+
+        // Per-mesh access. The index is not range checked here.
+        uint meshCount() const { return m_meshChartsArray.count(); }
+        const MeshCharts * meshAt(uint i) const { return m_meshChartsArray[i]; }
+        MeshCharts * meshAt(uint i) { return m_meshChartsArray[i]; }
+
+        // Chart access; defined out of line.
+        // NOTE(review): presumably the index runs over the charts of all meshes — confirm in Atlas.cpp.
+        uint chartCount() const;
+        const Chart * chartAt(uint i) const;
+        Chart * chartAt(uint i);
+
+        // Add mesh charts and takes ownership.
+        void addMeshCharts(MeshCharts * meshCharts);
+
+        void extractCharts(const HalfEdge::Mesh * mesh);
+        void computeCharts(const HalfEdge::Mesh * mesh, const SegmentationSettings & settings, const Array<uint> & unchartedMaterialArray);
+
+
+        // Compute a trivial seamless texture similar to ZBrush.
+        //bool computeSeamlessTextureAtlas(bool groupFaces = true, bool scaleTiles = false, uint w = 1024, uint h = 1024);
+
+        void parameterizeCharts();
+
+        // Pack charts in the smallest possible rectangle.
+        float packCharts(int quality, float texelArea, bool blockAlign, bool conservative);
+
+        // Failure flag: set with setFailed(), queried with hasFailed().
+        void setFailed() { failed = true; }
+        bool hasFailed() const { return failed; }
+
+    private:
+
+        // NOTE(review): no in-class initializer; presumably cleared by Atlas::Atlas() — confirm.
+        bool failed;
+        Array<MeshCharts *> m_meshChartsArray;
+
+    };
+
+
+    // Set of charts corresponding to a single mesh.
+    // The accessors below index internal arrays directly, without range checks.
+    class MeshCharts
+    {
+    public:
+        MeshCharts(const HalfEdge::Mesh * mesh);
+        ~MeshCharts();
+
+        uint chartCount() const { return m_chartArray.count(); }
+        uint vertexCount () const { return m_totalVertexCount; }
+
+        const Chart * chartAt(uint i) const { return m_chartArray[i]; }
+        Chart * chartAt(uint i) { return m_chartArray[i]; }
+
+        void computeVertexMap(const Array<uint> & unchartedMaterialArray);
+
+        // Extract the charts of the input mesh.
+        void extractCharts();
+
+        // Compute charts using a simple segmentation algorithm.
+        void computeCharts(const SegmentationSettings & settings, const Array<uint> & unchartedMaterialArray);
+
+        void parameterizeCharts();
+
+        // Per-face lookups, indexed by the face index in the input mesh.
+        uint faceChartAt(uint i) const { return m_faceChart[i]; }
+        uint faceIndexWithinChartAt(uint i) const { return m_faceIndex[i]; }
+
+        // Entry i of the chart vertex count prefix sum.
+        // NOTE(review): presumably the number of vertices in charts 0..i-1 — confirm.
+        uint vertexCountBeforeChartAt(uint i) const { return m_chartVertexCountPrefixSum[i]; }
+
+    private:
+
+        const HalfEdge::Mesh * m_mesh;
+
+        Array<Chart *> m_chartArray;
+        
+        Array<uint> m_chartVertexCountPrefixSum;
+        uint m_totalVertexCount;
+
+        Array<uint> m_faceChart; // the chart of every face of the input mesh.
+        Array<uint> m_faceIndex; // the index within the chart for every face of the input mesh.
+    };
+
+
+    /// A chart is a connected set of faces with a certain topology (usually a disk).
+    /// A chart holds two meshes: the chart mesh and a "unified" mesh. Chart vertices map
+    /// to unified vertices through mapChartVertexToUnifiedVertex().
+    class Chart
+    {
+    public:
+
+        Chart();
+
+        void build(const HalfEdge::Mesh * originalMesh, const Array<uint> & faceArray);
+        void buildVertexMap(const HalfEdge::Mesh * originalMesh, const Array<uint> & unchartedMaterialArray);
+
+        // Closes all boundary loops of the unified mesh except the longest one.
+        // Returns true when exactly one boundary remains (or there was at most one to begin with).
+        bool closeHoles();
+
+        bool isDisk() const { return m_isDisk; }
+        bool isVertexMapped() const { return m_isVertexMapped; }
+
+        // Vertex count of the chart mesh and of the unified mesh respectively.
+        uint vertexCount() const { return m_chartMesh->vertexCount(); }
+        uint colocalVertexCount() const { return m_unifiedMesh->vertexCount(); }
+
+        uint faceCount() const { return m_faceArray.count(); }
+        uint faceAt(uint i) const { return m_faceArray[i]; }
+
+        // Raw, non-owning access to the two meshes.
+        const HalfEdge::Mesh * chartMesh() const { return m_chartMesh.ptr(); }
+        HalfEdge::Mesh * chartMesh() { return m_chartMesh.ptr(); }
+        const HalfEdge::Mesh * unifiedMesh() const { return m_unifiedMesh.ptr(); }
+        HalfEdge::Mesh * unifiedMesh() { return m_unifiedMesh.ptr(); }
+
+        //uint vertexIndex(uint i) const { return m_vertexIndexArray[i]; }
+
+        // Index translation from chart mesh vertices; not range checked.
+        uint mapChartVertexToOriginalVertex(uint i) const { return m_chartToOriginalMap[i]; }
+        uint mapChartVertexToUnifiedVertex(uint i) const { return m_chartToUnifiedMap[i]; }
+
+        const Array<uint> & faceArray() const { return m_faceArray; }
+
+        // Copies texture coordinates from the unified mesh to the chart mesh.
+        void transferParameterization();
+
+        // computeSurfaceArea() multiplies the chart mesh area by `scale`.
+        // The parametric queries expect a disk chart that is not vertex mapped (debug checked).
+        float computeSurfaceArea() const;
+        float computeParametricArea() const;
+        Vector2 computeParametricBounds() const;
+
+
+        // Factor applied by computeSurfaceArea().
+        float scale = 1.0f;
+        // Dimensions of the vertex map.
+        // NOTE(review): not initialized in this header — confirm they are always assigned before use.
+        uint vertexMapWidth;
+        uint vertexMapHeight;
+
+    private:
+
+        // Fills the hole bounded by loop[start..]; used by closeHoles().
+        bool closeLoop(uint start, const Array<HalfEdge::Edge *> & loop);
+
+        // Chart mesh.
+        AutoPtr<HalfEdge::Mesh> m_chartMesh;
+        AutoPtr<HalfEdge::Mesh> m_unifiedMesh;
+
+        bool m_isDisk;
+        bool m_isVertexMapped;
+
+        // List of faces of the original mesh that belong to this chart.
+        Array<uint> m_faceArray;
+
+        // Map vertices of the chart mesh to vertices of the original mesh.
+        Array<uint> m_chartToOriginalMap;
+
+        // Map vertices of the chart mesh to vertices of the unified mesh.
+        Array<uint> m_chartToUnifiedMap;
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_ATLAS_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.cpp b/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.cpp
new file mode 100644
index 0000000000..bd2140c2f3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.cpp
@@ -0,0 +1,1320 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "AtlasBuilder.h"
+#include "Util.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/halfedge/Vertex.h"
+
+#include "nvmath/Matrix.inl"
+#include "nvmath/Vector.inl"
+
+//#include "nvcore/IntroSort.h"
+#include "nvcore/Array.inl"
+
+#include <algorithm> // std::sort
+
+#include <float.h> // FLT_MAX
+#include <limits.h> // UINT_MAX
+
+using namespace nv;
+
+namespace
+{
+
+    // Dummy implementation of a priority queue using sort at insertion.
+    // Invariant: `pairs` is kept in DESCENDING priority order, so the smallest
+    // priority sits at the end of the array.
+    // - Insertion is o(n)
+    // - Smallest element goes at the end, so that popping it is o(1).
+    // - Resorting is n*log(n)
+    // @@ Number of elements in the queue is usually small, and we'd have to rebalance often. I'm not sure it's worth implementing a heap.
+    // @@ Searching at removal would remove the need for sorting when priorities change.
+    struct PriorityQueue
+    {
+        // `size` caps the number of stored entries; once exceeded, the entry with
+        // the largest priority is discarded.
+        PriorityQueue(uint size = UINT_MAX) : maxSize(size) {}
+
+        // Sorted insertion of `face` with the given priority.
+        void push(float priority, uint face) {
+            // -- GODOT -- The comparison used to be '>', which kept the array in
+            // ascending order. That contradicted sort(), pop() and firstPriority(),
+            // and made the maxSize trim below evict the smallest priority.
+            // Find the first entry with a smaller priority and insert in front of it.
+            uint i = 0;
+            const uint count = pairs.count();
+            for (; i < count; i++) {
+                if (pairs[i].priority < priority) break;
+            }
+
+            Pair p = { priority, face };
+            pairs.insertAt(i, p);
+
+            // Over capacity: drop the front entry, i.e. the largest priority.
+            if (pairs.count() > maxSize) {
+                pairs.removeAt(0);
+            }
+        }
+
+        // push face out of order, to be sorted later.
+        // The entry gets priority 0; call sort() before relying on pop() / firstPriority().
+        void push(uint face) {
+            Pair p = { 0.0f, face };
+            pairs.append(p);
+        }
+
+        // Removes and returns the face with the smallest priority. The queue must not be empty.
+        uint pop() {
+            nvDebugCheck(pairs.count() > 0);
+            uint f = pairs.back().face;
+            pairs.pop_back();
+            return f;
+        }
+
+        // Restores the descending order after priorities were modified in place.
+        void sort() {
+            //nv::sort(pairs); // @@ My intro sort appears to be much slower than it should!
+            std::sort(pairs.buffer(), pairs.buffer() + pairs.count());
+        }
+
+        void clear() {
+            pairs.clear();
+        }
+
+        uint count() const { return pairs.count(); }
+
+        // Smallest priority in the queue. The queue must not be empty.
+        float firstPriority() const { return pairs.back().priority; }
+
+
+        const uint maxSize;
+
+        struct Pair {
+            bool operator <(const Pair & p) const { return priority > p.priority; } // !! Sort in inverse priority order!
+            float priority;
+            uint face;
+        };
+
+
+        Array<Pair> pairs;
+    };
+
+    // True when the normals at either end of `edge` differ from the normals
+    // stored at the matching ends of its pair edge.
+    static bool isNormalSeam(const HalfEdge::Edge * edge) {
+        const HalfEdge::Edge * opposite = edge->pair;
+
+        // The pair runs in the opposite direction, so edge->vertex matches opposite->next->vertex.
+        if (edge->vertex->nor != opposite->next->vertex->nor) return true;
+        return edge->next->vertex->nor != opposite->vertex->nor;
+    }
+
+    // True when the texture coordinates at either end of `edge` differ from the
+    // ones stored at the matching ends of its pair edge.
+    static bool isTextureSeam(const HalfEdge::Edge * edge) {
+        const HalfEdge::Edge * opposite = edge->pair;
+
+        // The pair runs in the opposite direction, so edge->vertex matches opposite->next->vertex.
+        if (edge->vertex->tex != opposite->next->vertex->tex) return true;
+        return edge->next->vertex->tex != opposite->vertex->tex;
+    }
+
+} // namespace
+
+
+// Working state of one chart while AtlasBuilder grows it.
+struct nv::ChartBuildData
+{
+    // Every numeric and vector member starts at zero; the containers start empty.
+    ChartBuildData(int id) :
+        id(id),
+        planeNormal(0),
+        centroid(0),
+        coneAxis(0),
+        coneAngle(0),
+        area(0),
+        boundaryLength(0),
+        normalSum(0),
+        centroidSum(0)
+    {
+    }
+
+    int id;
+
+    // Proxy info:
+    Vector3 planeNormal;
+    Vector3 centroid;
+    Vector3 coneAxis;
+    float coneAngle;
+
+    // Running totals, refreshed each time a face is added to the chart.
+    float area;
+    float boundaryLength;
+    Vector3 normalSum;
+    Vector3 centroidSum;
+
+    Array<uint> seeds;  // @@ These could be a pointers to the HalfEdge faces directly.
+    Array<uint> faces;
+    PriorityQueue candidates;
+};
+
+
+
+// Prepares the builder for mesh `m`: every face starts unassigned and the edge
+// lengths and face areas are cached up front.
+AtlasBuilder::AtlasBuilder(const HalfEdge::Mesh * m) : mesh(m), facesLeft(m->faceCount())
+{
+    const uint faceCount = m->faceCount();
+    const uint edgeCount = m->edgeCount();
+
+    // Both per-face tables start out filled with -1.
+    faceChartArray.resize(faceCount, -1);
+    faceCandidateArray.resize(faceCount, -1);
+
+    // @@ Floyd for the whole mesh is too slow. We could compute floyd progressively per patch as the patch grows. We need a better solution to compute most central faces.
+    //computeShortestPaths();
+
+    // Precompute edge lengths.
+    edgeLengths.resize(edgeCount);
+    for (uint e = 0; e < edgeCount; e++) {
+        const HalfEdge::Edge * edge = m->edgeAt(e);
+
+        // Edge slot e is expected to hold the half-edge pair with ids 2e and 2e+1.
+        const uint edgeId = edge->id;
+        nvDebugCheck(edgeId / 2 == e);
+
+        edgeLengths[e] = edge->length();
+    }
+
+    // Precompute face areas.
+    faceAreas.resize(faceCount);
+    for (uint f = 0; f < faceCount; f++) {
+        faceAreas[f] = m->faceAt(f)->area();
+    }
+}
+
+// Releases the chart build data objects owned through chartArray.
+AtlasBuilder::~AtlasBuilder()
+{
+    for (uint c = 0, total = chartArray.count(); c != total; ++c)
+    {
+        ChartBuildData * chart = chartArray[c];
+        delete chart;
+    }
+}
+
+
+// Excludes the listed faces from charting: each one is tagged with -2 in
+// faceChartArray, removed from the candidate list, and no longer counted in facesLeft.
+void AtlasBuilder::markUnchartedFaces(const Array<uint> & unchartedFaces)
+{
+    const uint excludedCount = unchartedFaces.count();
+
+    for (uint n = 0; n < excludedCount; n++) {
+        const uint face = unchartedFaces[n];
+
+        faceChartArray[face] = -2;
+        //faceCandidateArray[face] = -2; // @@ ?
+
+        removeCandidate(face);
+    }
+
+    nvDebugCheck(facesLeft >= excludedCount);
+    facesLeft -= excludedCount;
+}
+
+
+// Fills `shortestPaths` (a faceCount x faceCount matrix stored row by row) with the
+// shortest centroid-to-centroid path lengths between all pairs of faces.
+void AtlasBuilder::computeShortestPaths()
+{
+    const uint faceCount = mesh->faceCount();
+    shortestPaths.resize(faceCount*faceCount, FLT_MAX);
+
+    // Seed the matrix: zero on the diagonal, centroid distance between adjacent faces.
+    for (uint i = 0; i < faceCount; i++)
+    {
+        const uint row_i = i*faceCount;
+        shortestPaths[row_i + i] = 0.0f;
+
+        const HalfEdge::Face * face_i = mesh->faceAt(i);
+        Vector3 centroid_i = face_i->centroid();
+
+        for (HalfEdge::Face::ConstEdgeIterator it(face_i->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Edge * edge = it.current();
+
+            // Boundary edges have no face on the other side.
+            if (edge->isBoundary()) continue;
+
+            const HalfEdge::Face * face_j = edge->pair->face;
+
+            uint j = face_j->id;
+            Vector3 centroid_j = face_j->centroid();
+
+            // The distance is symmetric, so both entries are written.
+            shortestPaths[row_i + j] = shortestPaths[j*faceCount + i] = length(centroid_i - centroid_j);
+        }
+    }
+
+    // Use Floyd-Warshall algorithm to compute all paths:
+    for (uint k = 0; k < faceCount; k++)
+    {
+        const uint row_k = k*faceCount;
+
+        for (uint i = 0; i < faceCount; i++)
+        {
+            const uint row_i = i*faceCount;
+
+            for (uint j = 0; j < faceCount; j++)
+            {
+                // Keep the shorter of the known path and the path going through face k.
+                shortestPaths[row_i + j] = min(shortestPaths[row_i + j], shortestPaths[row_i + k]+shortestPaths[row_k + j]);
+            }
+        }
+    }
+}
+
+
+// Creates up to `maxSeedCount` randomly seeded charts, stopping early once no
+// unassigned face is left.
+void AtlasBuilder::placeSeeds(float threshold, uint maxSeedCount)
+{
+    // Instead of using a predefined number of seeds:
+    // - Add seeds one by one, growing chart until a certain threshold.
+    // - Undo charts and restart growing process.
+
+    // @@ How can we give preference to faces far from sharp features as in the LSCM paper?
+    //   - those points can be found using a simple flood filling algorithm.
+    //   - how do we weight the probabilities?
+
+    // facesLeft is re-read before every seed, since each new chart consumes faces.
+    for (uint seed = 0; seed < maxSeedCount && facesLeft != 0; seed++)
+    {
+        createRandomChart(threshold);
+    }
+}
+
+
+// Starts a new chart at a randomly chosen unassigned face and grows it with half
+// of the given threshold. Expects at least one unassigned face (facesLeft > 0).
+void AtlasBuilder::createRandomChart(float threshold)
+{
+    ChartBuildData * chart = new ChartBuildData(chartArray.count());
+    chartArray.append(chart);
+
+    // Pick the randomFaceIdx-th face (counting only faces not used by any chart yet).
+    const uint randomFaceIdx = rand.getRange(facesLeft - 1);
+
+    uint seedFace = 0;
+    for (uint skipped = 0; ; skipped++, seedFace++)
+    {
+        // Advance to the next unassigned face.
+        while (faceChartArray[seedFace] != -1) seedFace++;
+
+        if (skipped == randomFaceIdx) break;
+    }
+
+    chart->seeds.append(seedFace);
+
+    addFaceToChart(chart, seedFace, true);
+
+    // Grow the chart as much as possible within the given threshold.
+    growChart(chart, threshold * 0.5f, facesLeft);
+    //growCharts(threshold - threshold * 0.75f / chartCount(), facesLeft);
+}
+
+// Assigns face `f` to `chart` and brings all dependent bookkeeping up to date.
+//   chart          - chart that receives the face.
+//   f              - face index; must not belong to any chart yet (debug checked).
+//   recomputeProxy - when true, the chart proxy is refreshed via updateProxy().
+//                    Some callers omit it, so the declaration supplies a default (not visible here).
+// The statements below are order-dependent: the evaluate*() helpers receive the chart
+// together with the new face, and the candidate updates run after the chart totals are stored.
+void AtlasBuilder::addFaceToChart(ChartBuildData * chart, uint f, bool recomputeProxy)
+{
+    // Add face to chart.
+    chart->faces.append(f);
+
+    nvDebugCheck(faceChartArray[f] == -1);
+    faceChartArray[f] = chart->id;
+
+    // One less face waiting to be assigned.
+    facesLeft--;
+
+    // Update area and boundary length.
+    chart->area = evaluateChartArea(chart, f);
+    chart->boundaryLength = evaluateBoundaryLength(chart, f);
+    chart->normalSum = evaluateChartNormalSum(chart, f);
+    chart->centroidSum = evaluateChartCentroidSum(chart, f);
+
+    if (recomputeProxy) {
+        // Update proxy and candidate's priorities.
+        updateProxy(chart);
+    }
+
+    // Update candidates: f itself is no longer a candidate.
+    removeCandidate(f);
+    updateCandidates(chart, f);
+    updatePriorities(chart);
+}
+
+// Returns the candidate with the lowest metric; the first one wins on ties.
+// The candidate array must not be empty.
+// @@ Get N best candidates in one pass.
+const AtlasBuilder::Candidate & AtlasBuilder::getBestCandidate() const
+{
+    const uint candidateCount = candidateArray.count();
+    nvCheck(candidateCount > 0);
+
+    // Linear scan; falls back to index 0 when no metric is below FLT_MAX.
+    uint bestIndex = 0;
+    float lowestMetric = FLT_MAX;
+
+    uint index = 0;
+    while (index < candidateCount)
+    {
+        const Candidate & entry = candidateArray[index];
+
+        if (entry.metric < lowestMetric) {
+            lowestMetric = entry.metric;
+            bestIndex = index;
+        }
+
+        index++;
+    }
+
+    return candidateArray[bestIndex];
+}
+
+
+// Adds up to `faceCount` faces to the charts, always taking the globally best candidate.
+// Returns true if any of the charts can grow more.
+bool AtlasBuilder::growCharts(float threshold, uint faceCount)
+{
+#if 1 // Using one global list.
+
+    // Never try to add more faces than are still unassigned.
+    const uint budget = min(faceCount, facesLeft);
+
+    for (uint added = 0; added < budget; added++)
+    {
+        const Candidate & best = getBestCandidate();
+
+        // Even the best candidate is too expensive: can't grow more.
+        if (best.metric > threshold) {
+            return false;
+        }
+
+        addFaceToChart(best.chart, best.face);
+    }
+
+    return facesLeft != 0; // Can continue growing.
+
+#else // Using one list per chart.
+    bool canGrowMore = false;
+
+    const uint chartCount = chartArray.count();
+    for (uint i = 0; i < chartCount; i++)
+    {
+        if (growChart(chartArray[i], threshold, faceCount))
+        {
+            canGrowMore = true;
+        }
+    }
+
+    return canGrowMore;
+#endif
+}
+
+// Moves up to faceCount of the chart's best candidates into the chart, as long as their
+// priority stays within threshold. Returns false once the chart has run out of admissible
+// candidates, true if another admissible candidate is still waiting.
+bool AtlasBuilder::growChart(ChartBuildData * chart, float threshold, uint faceCount)
+{
+    uint added = 0;
+    while (added < faceCount)
+    {
+        const bool exhausted = chart->candidates.count() == 0 || chart->candidates.firstPriority() > threshold;
+        if (exhausted)
+        {
+            return false;
+        }
+
+        // Candidates that already belong to a chart are dropped without being counted.
+        const uint candidateFace = chart->candidates.pop();
+        if (faceChartArray[candidateFace] != -1) continue;
+
+        addFaceToChart(chart, candidateFace);
+        added++;
+    }
+
+    // Quota reached: report whether another admissible candidate is waiting.
+    return !(chart->candidates.count() == 0 || chart->candidates.firstPriority() > threshold);
+}
+
+
+// Returns every face to the unassigned state and restarts each chart from its most recent seed.
+void AtlasBuilder::resetCharts()
+{
+    const uint totalFaces = mesh->faceCount();
+    facesLeft = totalFaces;
+
+    // Forget all chart and candidate assignments.
+    for (uint f = 0; f < totalFaces; f++)
+    {
+        faceChartArray[f] = -1;
+        faceCandidateArray[f] = -1;
+    }
+    candidateArray.clear();
+
+    const uint chartCount = chartArray.count();
+    for (uint c = 0; c < chartCount; c++)
+    {
+        ChartBuildData * chart = chartArray[c];
+
+        // Wipe the accumulated state; the seed history itself is kept.
+        chart->faces.clear();
+        chart->candidates.clear();
+        chart->area = 0.0f;
+        chart->boundaryLength = 0.0f;
+        chart->normalSum = Vector3(0);
+        chart->centroidSum = Vector3(0);
+
+        // Re-seed with the last recorded seed face.
+        addFaceToChart(chart, chart->seeds.back());
+    }
+}
+
+
+// Queues the still unassigned faces adjacent to face f as candidates of `chart`.
+void AtlasBuilder::updateCandidates(ChartBuildData * chart, uint f)
+{
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        // Look across each edge through its opposite half-edge.
+        const HalfEdge::Edge * opposite = it.current()->pair;
+        if (opposite->isBoundary())
+        {
+            continue; // Skip boundary half-edges.
+        }
+
+        const uint neighbor = opposite->face->id;
+        if (faceChartArray[neighbor] == -1)
+        {
+            chart->candidates.push(neighbor);
+        }
+    }
+}
+
+
+// Refits the proxy of every chart.
+void AtlasBuilder::updateProxies()
+{
+    for (uint c = 0, n = chartArray.count(); c < n; c++)
+    {
+        updateProxy(chartArray[c]);
+    }
+}
+
+
+namespace {
+    // Sum of the absolute values of the four components of v.
+    float absoluteSum(Vector4::Arg v)
+    {
+        return fabs(v.x) + fabs(v.y) + fabs(v.z) + fabs(v.w);
+    }
+
+    //#pragma message(NV_FILE_LINE "FIXME: Using the c=cos(theta) substitution, the equation system becomes linear and we can avoid the newton solver.")
+
+    // Iterative fit of a cone to the face normals of a chart. The unknown X packs the cone
+    // axis in xyz and the cone angle in w. The only call site in updateProxy is commented out.
+    struct ConeFitting
+    {
+        // m: mesh owning the faces, g: step scale, tf / tx: tolerances on the gradient and on the step.
+        ConeFitting(const HalfEdge::Mesh * m, float g, float tf, float tx) : mesh(m), gamma(g), tolf(tf), tolx(tx), F(0), D(0), H(0) {
+        }
+
+        // Adds the contribution of one face normal N to F and D, and writes H. The area A is not used.
+        // NOTE(review): H is assigned rather than accumulated, and F / D are never reset - confirm intent.
+        void addTerm(Vector3 N, float A)
+        {
+            const float c = cosf(X.w);
+            const float s = sinf(X.w);
+            const float tmp = dot(X.xyz(), N) - c;
+
+            F += tmp * tmp;
+            D.x += 2 * X.x * tmp;
+            D.y += 2 * X.y * tmp;
+            D.z += 2 * X.z * tmp;
+            D.w += 2 * s * tmp;
+
+            H(0,0) = 2 * X.x * N.x + 2 * tmp;
+            H(0,1) = 2 * X.x * N.y;
+            H(0,2) = 2 * X.x * N.z;
+            H(0,3) = 2 * X.x * s;
+            H(1,0) = 2 * X.y * N.x;
+            H(1,1) = 2 * X.y * N.y + 2 * tmp;
+            H(1,2) = 2 * X.y * N.z;
+            H(1,3) = 2 * X.y * s;
+            H(2,0) = 2 * X.z * N.x;
+            H(2,1) = 2 * X.z * N.y;
+            H(2,2) = 2 * X.z * N.z + 2 * tmp;
+            H(2,3) = 2 * X.z * s;
+            H(3,0) = 2 * s * N.x;
+            H(3,1) = 2 * s * N.y;
+            H(3,2) = 2 * s * N.z;
+            H(3,3) = 2 * s * s + 2 * c * tmp;
+        }
+
+        // Iterates from `start` until both the gradient D and the last step dX are within
+        // tolerance, and returns the fitted X (axis re-normalized after every step).
+        Vector4 solve(ChartBuildData * chart, Vector4 start)
+        {
+            const uint faceCount = chart->faces.count();
+            X = start;
+
+            // Step of the latest iteration. It is declared once, outside the loop, because the
+            // loop condition reads it; a second declaration inside the loop would hide it.
+            Vector4 dX;
+
+            do {
+                for (uint i = 0; i < faceCount; i++)
+                {
+                    const HalfEdge::Face * face = mesh->faceAt(chart->faces[i]);
+                    addTerm(face->normal(), face->area());
+                }
+
+                //solveKramer(H, D, &dX);
+                solveLU(H, D, &dX);
+
+                // @@ Do a full newton step and reduce by half if F doesn't decrease.
+                X -= gamma * dX;
+
+                // Constrain normal to be normalized.
+                X = Vector4(normalize(X.xyz()), X.w);
+
+            } while(absoluteSum(D) > tolf || absoluteSum(dX) > tolx);
+
+            return X;
+        }
+
+        HalfEdge::Mesh const * const mesh;
+        const float gamma;  // Scale applied to every step.
+        const float tolf;   // Tolerance on absoluteSum(D).
+        const float tolx;   // Tolerance on absoluteSum(dX).
+
+        Vector4 X;          // Current estimate: axis in xyz, angle in w.
+        float F;            // Accumulated squared residual.
+        Vector4 D;          // Accumulated gradient terms.
+        Matrix H;           // Matrix handed to the linear solver.
+    };
+
+    // Face normal assuming it's a triangle, passed through normalizeSafe with Vector3(0) as fallback.
+    static Vector3 triangleNormal(const HalfEdge::Face * face)
+    {
+        const HalfEdge::Edge * first = face->edge;
+        const Vector3 origin = first->vertex->pos;
+
+        // Edge vectors from the first vertex to the third and to the second one.
+        const Vector3 toThird = first->next->next->vertex->pos - origin;
+        const Vector3 toSecond = first->next->vertex->pos - origin;
+
+        return normalizeSafe(cross(toThird, toSecond), Vector3(0), 0.0f);
+    }
+
+    // Face normal left unnormalized: the raw cross product of two triangle edge vectors.
+    static Vector3 triangleNormalAreaScaled(const HalfEdge::Face * face)
+    {
+        const HalfEdge::Edge * first = face->edge;
+        const Vector3 origin = first->vertex->pos;
+
+        const Vector3 toThird = first->next->next->vertex->pos - origin;
+        const Vector3 toSecond = first->next->vertex->pos - origin;
+
+        return cross(toThird, toSecond);
+    }
+
+    // Blend of the three edge sums (p_i + p_j), each weighted by its edge's share of the perimeter.
+    // The aim is a point inside the triangle, but closer to the circumcenter.
+    // NOTE(review): (p_i + p_j) is twice the edge midpoint, so the result looks scaled by 2 - confirm.
+    static Vector3 triangleCenter(const HalfEdge::Face * face)
+    {
+        const Vector3 a = face->edge->vertex->pos;
+        const Vector3 b = face->edge->next->vertex->pos;
+        const Vector3 c = face->edge->next->next->vertex->pos;
+        const float lenAB = length(b - a);
+        const float lenBC = length(c - b);
+        const float lenCA = length(a - c);
+        const float perimeter = lenAB + lenBC + lenCA;
+
+        const Vector3 wAB = (a + b) * lenAB / perimeter;
+        const Vector3 wBC = (b + c) * lenBC / perimeter;
+        const Vector3 wCA = (c + a) * lenCA / perimeter;
+        return wAB + wBC + wCA;
+    }
+
+} // namespace
+
+// Refits the planar proxy of the chart from its accumulated sums. TODO: Use best fit plane instead of average normal.
+void AtlasBuilder::updateProxy(ChartBuildData * chart)
+{
+    const float chartFaceCount = float(chart->faces.count());
+    chart->centroid = chart->centroidSum / chartFaceCount;
+    chart->planeNormal = normalizeSafe(chart->normalSum, Vector3(0), 0.0f);
+
+    //#pragma message(NV_FILE_LINE "TODO: Experiment with conic fitting.")
+
+    // F = (Nc*Nt - cos Oc)^2 = (x*Nt_x + y*Nt_y + z*Nt_z - cos w)^2
+    // dF/dx = 2 * x * (x*Nt_x + y*Nt_y + z*Nt_z - cos w)
+    // dF/dy = 2 * y * (x*Nt_x + y*Nt_y + z*Nt_z - cos w)
+    // dF/dz = 2 * z * (x*Nt_x + y*Nt_y + z*Nt_z - cos w)
+    // dF/dw = 2 * sin w * (x*Nt_x + y*Nt_y + z*Nt_z - cos w)
+
+    // JacobianMatrix({
+    // 2 * x * (x*Nt_x + y*Nt_y + z*Nt_z - Cos(w)),
+    // 2 * y * (x*Nt_x + y*Nt_y + z*Nt_z - Cos(w)),
+    // 2 * z * (x*Nt_x + y*Nt_y + z*Nt_z - Cos(w)),
+    // 2 * Sin(w) * (x*Nt_x + y*Nt_y + z*Nt_z - Cos(w))}, {x,y,z,w})
+
+    // H[0,0] = 2 * x * Nt_x + 2 * (x*Nt_x + y*Nt_y + z*Nt_z - cos(w));
+    // H[0,1] = 2 * x * Nt_y;
+    // H[0,2] = 2 * x * Nt_z;
+    // H[0,3] = 2 * x * sin(w);
+
+    // H[1,0] = 2 * y * Nt_x;
+    // H[1,1] = 2 * y * Nt_y + 2 * (x*Nt_x + y*Nt_y + z*Nt_z - cos(w));
+    // H[1,2] = 2 * y * Nt_z;
+    // H[1,3] = 2 * y * sin(w);
+
+    // H[2,0] = 2 * z * Nt_x;
+    // H[2,1] = 2 * z * Nt_y;
+    // H[2,2] = 2 * z * Nt_z + 2 * (x*Nt_x + y*Nt_y + z*Nt_z - cos(w));
+    // H[2,3] = 2 * z * sin(w);
+
+    // H[3,0] = 2 * sin(w) * Nt_x;
+    // H[3,1] = 2 * sin(w) * Nt_y;
+    // H[3,2] = 2 * sin(w) * Nt_z;
+    // H[3,3] = 2 * sin(w) * sin(w) + 2 * cos(w) * (x*Nt_x + y*Nt_y + z*Nt_z - cos(w));
+
+    // @@ Cone fitting might be quite slow.
+
+    /*ConeFitting coneFitting(mesh, 0.1f, 0.001f, 0.001f);
+
+    Vector4 start = Vector4(chart->coneAxis, chart->coneAngle);
+    Vector4 solution = coneFitting.solve(chart, start);
+
+    chart->coneAxis = solution.xyz();
+    chart->coneAngle = solution.w;*/
+}
+
+
+
+// Relocates the seed of every chart. Returns true when at least one chart got a new seed.
+bool AtlasBuilder::relocateSeeds()
+{
+    bool changed = false;
+
+    // Every chart is visited, even after a change has already been seen.
+    const uint chartCount = chartArray.count();
+    for (uint c = 0; c < chartCount; c++)
+    {
+        const bool moved = relocateSeed(chartArray[c]);
+        changed = changed || moved;
+    }
+
+    return changed;
+}
+
+
+// Picks a new seed face for the chart among the faces that fit its proxy best.
+// Returns true if a face that was never a seed of this chart was appended to chart->seeds,
+// false if the chosen face had been a seed before (it is then moved to the back of the list).
+bool AtlasBuilder::relocateSeed(ChartBuildData * chart)
+{
+    Vector3 centroid = computeChartCentroid(chart);
+
+    const uint N = 10;  // @@ Hardcoded to 10?
+    PriorityQueue bestTriangles(N); 
+
+    // Find the first N triangles that fit the proxy best.
+    const uint faceCount = chart->faces.count();
+    for (uint i = 0; i < faceCount; i++)
+    {
+        float priority = evaluateProxyFitMetric(chart, chart->faces[i]);
+        bestTriangles.push(priority, chart->faces[i]);
+    }
+
+    // Of those, pick the triangle whose center has the largest distance to the chart centroid.
+    // NOTE(review): the original comment asked for the "most central" triangle - confirm intent.
+    // Start from the current seed so the result is defined even if no triangle is selected below.
+    uint mostCentral = chart->seeds.back();
+    float maxDistance = -1;
+
+    const uint bestCount = bestTriangles.count();
+    for (uint i = 0; i < bestCount; i++)
+    {
+        const HalfEdge::Face * face = mesh->faceAt(bestTriangles.pairs[i].face);
+        Vector3 faceCentroid = triangleCenter(face);
+
+        float distance = length(centroid - faceCentroid);
+        // TODO: Implement evaluateDistanceToBoundary and use it here instead.
+        if (distance > maxDistance)
+        {
+            maxDistance = distance;
+            mostCentral = bestTriangles.pairs[i].face;
+        }
+    }
+    nvDebugCheck(maxDistance >= 0);
+
+    // In order to prevent k-means cycles we record all the previously chosen seeds.
+    uint index;
+    if (chart->seeds.find(mostCentral, &index))
+    {
+        // Move the reused seed to the end of the seed array.
+        uint last = chart->seeds.count() - 1;
+        swap(chart->seeds[index], chart->seeds[last]);
+        return false;
+    }
+
+    // Append new seed.
+    chart->seeds.append(mostCentral);
+    return true;
+}
+
+// Drops face f from the global candidate list, keeping faceCandidateArray in sync.
+void AtlasBuilder::removeCandidate(uint f)
+{
+    const int slot = faceCandidateArray[f];
+    if (slot == -1) return; // Not a candidate.
+    faceCandidateArray[f] = -1;
+
+    if (slot == candidateArray.count() - 1) {
+        candidateArray.popBack();
+        return;
+    }
+    // Otherwise replace the entry with the last one and repoint that entry's face at its new slot.
+    candidateArray.replaceWithLast(slot);
+    faceCandidateArray[candidateArray[slot].face] = slot;
+}
+
+// Offers face f to the global candidate list on behalf of `chart`. An existing entry is
+// overwritten when the offered metric is lower or when it already belongs to the same chart.
+void AtlasBuilder::updateCandidate(ChartBuildData * chart, uint f, float metric)
+{
+    const int slot = faceCandidateArray[f];
+    if (slot != -1) {
+        Candidate & existing = candidateArray[slot];
+        nvDebugCheck(existing.face == f);
+
+        if (chart == existing.chart || metric < existing.metric) {
+            existing.metric = metric;
+            existing.chart = chart;
+        }
+        return;
+    }
+
+    // First time this face is offered: append a fresh entry.
+    const uint index = candidateArray.count();
+    faceCandidateArray[f] = index;
+    candidateArray.resize(index + 1);
+    candidateArray[index].face = f;
+    candidateArray[index].chart = chart;
+    candidateArray[index].metric = metric;
+}
+
+
+// Recomputes the priority of every candidate of `chart`, offers the still unassigned ones
+// to the global candidate list and re-sorts the chart's own queue.
+void AtlasBuilder::updatePriorities(ChartBuildData * chart)
+{
+    const uint count = chart->candidates.count();
+    for (uint i = 0; i < count; i++)
+    {
+        const uint face = chart->candidates.pairs[i].face;
+        chart->candidates.pairs[i].priority = evaluatePriority(chart, face);
+        if (faceChartArray[face] == -1)
+        {
+            updateCandidate(chart, face, chart->candidates.pairs[i].priority);
+        }
+    }
+
+    chart->candidates.sort();
+}
+
+
+// Evaluate combined metric: the cost of adding `face` to `chart` (candidates with lower cost are added first).
+float AtlasBuilder::evaluatePriority(ChartBuildData * chart, uint face)
+{
+    // Boundary length and area the chart would have with the face included.
+    const float grownBoundary = evaluateBoundaryLength(chart, face);
+    const float grownArea = evaluateChartArea(chart, face);
+
+    const float proxyFit = evaluateProxyFitMetric(chart, face);
+    const float roundness = evaluateRoundnessMetric(chart, face, grownBoundary, grownArea);
+    const float straightness = evaluateStraightnessMetric(chart, face);
+
+    // Penalize faces that cross seams, reward faces that close seams or reach boundaries.
+    const float normalSeam = evaluateNormalSeamMetric(chart, face);
+    const float textureSeam = evaluateTextureSeamMetric(chart, face);
+
+    //float R = evaluateCompletenessMetric(chart, face);
+
+    //float D = evaluateDihedralAngleMetric(chart, face);
+    // @@ Add a metric based on local dihedral angle.
+
+    // @@ Tweaking the normal and texture seam metrics.
+    // - Cause more impedance. Never cross 90 degree edges.
+
+    // Weighted sum of the individual metrics.
+    float cost = float(
+        settings.proxyFitMetricWeight * proxyFit + 
+        settings.roundnessMetricWeight * roundness + 
+        settings.straightnessMetricWeight * straightness +
+        settings.normalSeamMetricWeight * normalSeam +
+        settings.textureSeamMetricWeight * textureSeam);
+
+    /*cost = settings.proxyFitMetricWeight * powf(F, settings.proxyFitMetricExponent);
+    cost = max(cost, settings.roundnessMetricWeight * powf(C, settings.roundnessMetricExponent));
+    cost = max(cost, settings.straightnessMetricWeight * pow(P, settings.straightnessMetricExponent));
+    cost = max(cost, settings.normalSeamMetricWeight * N);
+    cost = max(cost, settings.textureSeamMetricWeight * T);*/
+
+    // Enforce limits strictly, and make sure normal seams are fully respected:
+    const bool tooLarge = grownArea > settings.maxChartArea || grownBoundary > settings.maxBoundaryLength;
+    const bool forbiddenSeam = settings.normalSeamMetricWeight >= 1000 && normalSeam != 0;
+    if (tooLarge || forbiddenSeam) {
+        cost = FLT_MAX;
+    }
+
+    nvCheck(isFinite(cost));
+    return cost;
+}
+
+
+// One minus the dot product of the face normal and the chart's plane normal:
+// 0 when two unit normals coincide, growing as they diverge.
+float AtlasBuilder::evaluateProxyFitMetric(ChartBuildData * chart, uint f)
+{
+    const Vector3 faceNormal = triangleNormal(mesh->faceAt(f));
+
+    // Use plane fitting metric for now (disabled alternatives: cone fit, squared deviation):
+    //return square(dot(chart->coneAxis, faceNormal) - cosf(chart->coneAngle));
+    //return square(1 - dot(faceNormal, chart->planeNormal));
+    return 1 - dot(faceNormal, chart->planeNormal); // @@ normal deviations should be weighted by face area
+
+    // Find distance to chart.
+    /*Vector3 faceCentroid = face->centroid();
+
+    float dist = 0;
+    int count = 0;
+
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        if (!edge->isBoundary()) {
+            const HalfEdge::Face * neighborFace = edge->pair()->face();
+            if (faceChartArray[neighborFace->id()] == chart->id) {
+                dist += length(neighborFace->centroid() - faceCentroid);
+                count++;
+            }
+        }
+    }
+
+    dist /= (count * count);
+
+    return (1 - dot(faceNormal, chart->planeNormal)) * dist;*/
+
+    //return (1 - dot(faceNormal, chart->planeNormal));
+}
+
+float AtlasBuilder::evaluateDistanceToBoundary(ChartBuildData * chart, uint face)
+{
+    // TODO: Evaluate distance to boundary metric. Not implemented yet: both arguments are ignored.
+
+    // @@ This is needed for the seed relocation code.
+    // @@ This could provide a better roundness metric.
+    
+    return 0.0f; // Placeholder value.
+}
+
+// Distance between the triangle centers of face f and of the chart's last recorded seed.
+float AtlasBuilder::evaluateDistanceToSeed(ChartBuildData * chart, uint f)
+{
+    // A lookup into a precomputed shortest path table is disabled:
+    //return shortestPaths[chart->seeds.back() * mesh->faceCount() + f];
+
+    const Vector3 seedCenter = triangleCenter(mesh->faceAt(chart->seeds.back()));
+    const Vector3 faceCenter = triangleCenter(mesh->faceAt(f));
+    return length(seedCenter - faceCenter);
+}
+
+
+// Roundness cost of growing the chart to the given boundary length and area.
+// Garland's Hierarchical Face Clustering paper uses ratio between boundary and area, which is
+// easier to compute than the distance to seed used by D-charts and might work as well:
+// roundness = D^2/4*pi*A -> circle = 1, non circle greater than 1
+float AtlasBuilder::evaluateRoundnessMetric(ChartBuildData * chart, uint face, float newBoundaryLength, float newChartArea)
+{
+    // Disabled alternatives (D-charts: C(c,t) = pi * D(S_c,t)^2 / A_c):
+    //return PI * square(evaluateDistanceToSeed(chart, face)) / chart->area;
+    //return 2 * PI * evaluateDistanceToSeed(chart, face) / chart->boundaryLength;
+    //return square(newBoundaryLength) / (4 * PI * newChartArea);
+    //return clamp(1 - (4 * PI * newChartArea) / square(newBoundaryLength), 0.0f, 1.0f);
+
+    // Squared boundary over area, before and after adding the face.
+    const float currentRatio = square(chart->boundaryLength) / chart->area;
+    const float grownRatio = square(newBoundaryLength) / newChartArea;
+
+    if (grownRatio > currentRatio)
+    {
+        // The ratio gets worse: charge the absolute roundness of the grown chart.
+        return square(newBoundaryLength) / (newChartArea * 4 * PI);
+    }
+
+    // Offer no impedance to faces that improve roundness.
+    return 0;
+
+    // Use the ratio between the new roundness vs. the previous roundness.
+    // - If we use the absolute metric, when the initial face is very long, then it's hard to make any progress.
+    //return (square(newBoundaryLength) * chart->area) / (square(chart->boundaryLength) * newChartArea);
+    //return (4 * PI * newChartArea) / square(newBoundaryLength) - (4 * PI * chart->area) / square(chart->boundaryLength);
+
+    //if (square(newBoundaryLength) * chart->area) / (square(chart->boundaryLength) * newChartArea);
+
+}
+
+// Compares how much of the perimeter of face f borders the chart with how much does not.
+// The ratio (outside - inside) / (outside + inside) is capped at zero, so only faces whose
+// perimeter mostly touches the chart get a (negative) value.
+float AtlasBuilder::evaluateStraightnessMetric(ChartBuildData * chart, uint f)
+{
+    float insideLength = 0.0f;
+    float outsideLength = 0.0f;
+
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        // Cached length, used instead of edge->length().
+        const float edgeLength = edgeLengths[edge->id/2];
+
+        // An edge is shared with the chart when the face across it already belongs to the chart.
+        const bool sharedWithChart = !edge->isBoundary() && faceChartArray[edge->pair->face->id] == chart->id;
+        if (sharedWithChart)
+        {
+            insideLength += edgeLength;
+        }
+        else
+        {
+            outsideLength += edgeLength;
+        }
+    }
+
+    nvDebugCheck(insideLength != 0.0f); // Candidate face must be adjacent to chart. @@ This is not true if the input mesh has zero-length edges.
+
+    // Disabled alternatives: l_out / l_in, and scaling negative ratios by 10 to encourage closing gaps.
+    const float ratio = (outsideLength - insideLength) / (outsideLength + insideLength);
+
+    // Only use the straightness metric to close gaps.
+    return min(ratio, 0.0f);
+}
+
+
+// Share of the edge length between face f and the chart that lies on normal seams, each
+// seam edge scaled by how much the vertex normals on its two sides disagree.
+float AtlasBuilder::evaluateNormalSeamMetric(ChartBuildData * chart, uint f)
+{
+    float weightedSeamLength = 0.0f;
+    float sharedLength = 0.0f;
+
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        // Only edges whose opposite face is already in the chart take part.
+        const bool sharedWithChart = !edge->isBoundary() && faceChartArray[edge->pair->face->id] == chart->id;
+        if (!sharedWithChart)
+        {
+            continue;
+        }
+
+        //float l = edge->length();
+        const float edgeLength = edgeLengths[edge->id/2];
+        sharedLength += edgeLength;
+
+        // Make sure it's a normal seam.
+        if (edge->isSeam() && isNormalSeam(edge))
+        {
+            // Dot products of the vertex normals facing each other across the edge, clamped to [0, 1].
+            const float d0 = clamp(dot(edge->vertex->nor, edge->pair->next->vertex->nor), 0.0f, 1.0f);
+            const float d1 = clamp(dot(edge->next->vertex->nor, edge->pair->vertex->nor), 0.0f, 1.0f);
+
+            //float a0 = clamp(acosf(d0) / (PI/2), 0.0f, 1.0f);
+            //float a1 = clamp(acosf(d1) / (PI/2), 0.0f, 1.0f);
+            //l *= (a0 + a1) * 0.5f;
+
+            // Identical normals contribute nothing, perpendicular (or opposed) ones the full length.
+            const float weighted = edgeLength * (1 - (d0 + d1) * 0.5f);
+            weightedSeamLength += weighted;
+        }
+    }
+
+    if (weightedSeamLength == 0)
+    {
+        return 0.0f;
+    }
+    return weightedSeamLength / sharedLength;
+}
+
+
+// Share of the edge length between face f and the chart that lies on texture seams.
+// Only edges whose opposite face already belongs to the chart are considered.
+// Returns 0 when no such seam edge exists.
+float AtlasBuilder::evaluateTextureSeamMetric(ChartBuildData * chart, uint f)
+{
+    float textureSeamLength = 0.0f;
+    //float newSeamLength = 0.0f;
+    //float oldSeamLength = 0.0f;
+    float sharedLength = 0.0f;
+
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        // Earlier variant, kept for reference:
+        /*float l = edge->length();
+        totalLength += l;
+
+        if (edge->isBoundary() || !edge->isSeam()) {
+            continue;
+        }
+
+        // Make sure it's a texture seam.
+        if (isTextureSeam(edge))
+        {
+            uint neighborFaceId = edge->pair()->face()->id();
+            if (faceChartArray[neighborFaceId] != chart->id) {
+                newSeamLength += l;
+            }
+            else {
+                oldSeamLength += l;
+            }
+        }*/
+
+        // Only edges whose opposite face is already in the chart take part.
+        const bool sharedWithChart = !edge->isBoundary() && faceChartArray[edge->pair->face->id] == chart->id;
+        if (!sharedWithChart)
+        {
+            continue;
+        }
+
+        //float l = edge->length();
+        const float edgeLength = edgeLengths[edge->id/2];
+        sharedLength += edgeLength;
+
+        // Make sure it's a texture seam.
+        if (edge->isSeam() && isTextureSeam(edge))
+        {
+            textureSeamLength += edgeLength;
+        }
+    }
+
+    // Avoid division by zero.
+    if (textureSeamLength == 0.0f)
+    {
+        return 0.0f;
+    }
+
+    return textureSeamLength / sharedLength;
+}
+
+
+// (new seam length - old seam length) / perimeter of face f. Boundary edges and seam edges towards
+// faces outside the chart count as new; seam edges shared with the chart count as old.
+float AtlasBuilder::evaluateSeamMetric(ChartBuildData * chart, uint f)
+{
+    float addedSeam = 0.0f;
+    float existingSeam = 0.0f;
+    float perimeter = 0.0f;
+
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        const float edgeLength = edgeLengths[edge->id/2];
+        perimeter += edgeLength;
+
+        if (edge->isBoundary())
+        {
+            addedSeam += edgeLength;
+            continue;
+        }
+        if (!edge->isSeam())
+        {
+            continue; // Interior non-seam edges only count towards the perimeter.
+        }
+
+        const bool sharedWithChart = faceChartArray[edge->pair->face->id] == chart->id;
+        if (sharedWithChart) {
+            existingSeam += edgeLength;
+        }
+        else {
+            addedSeam += edgeLength;
+        }
+    }
+
+    return (addedSeam - existingSeam) / perimeter;
+}
+
+
+// Area the chart would have after adding face f, using the cached faceAreas instead of face->area().
+float AtlasBuilder::evaluateChartArea(ChartBuildData * chart, uint f)
+{
+    const uint faceId = mesh->faceAt(f)->id;
+    return chart->area + faceAreas[faceId];
+}
+
+
+// Boundary length of the chart after adding face f: edges of f that are not shared with the
+// chart are added, edges shared with the chart are subtracted.
+float AtlasBuilder::evaluateBoundaryLength(ChartBuildData * chart, uint f)
+{
+    float grownBoundary = chart->boundaryLength;
+
+    const HalfEdge::Face * face = mesh->faceAt(f);
+    for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        const HalfEdge::Edge * edge = it.current();
+
+        //float edgeLength = edge->length();
+        const float edgeLength = edgeLengths[edge->id/2];
+
+        // An edge is shared with the chart when the face across it already belongs to the chart.
+        const bool sharedWithChart = !edge->isBoundary() && faceChartArray[edge->pair->face->id] == chart->id;
+        if (sharedWithChart)
+        {
+            grownBoundary -= edgeLength;
+        }
+        else
+        {
+            grownBoundary += edgeLength;
+        }
+    }
+
+    //nvDebugCheck(boundaryLength >= 0);
+
+    // Clamp so the reported length is never negative.
+    return max(0.0f, grownBoundary);  // @@ Hack!
+}
+
+// Normal sum the chart would have after adding the unnormalized normal of face f.
+Vector3 AtlasBuilder::evaluateChartNormalSum(ChartBuildData * chart, uint f)
+{
+    return chart->normalSum + triangleNormalAreaScaled(mesh->faceAt(f));
+}
+
+// Centroid sum the chart would have after adding the centroid of face f.
+Vector3 AtlasBuilder::evaluateChartCentroidSum(ChartBuildData * chart, uint f)
+{
+    return chart->centroidSum + mesh->faceAt(f)->centroid();
+}
+
+
+// Sum of triangleCenter() over all faces of the chart, divided by the number of faces.
+Vector3 AtlasBuilder::computeChartCentroid(const ChartBuildData * chart)
+{
+    const uint count = chart->faces.count();
+
+    Vector3 centerSum(0);
+    for (uint i = 0; i < count; i++)
+    {
+        centerSum += triangleCenter(mesh->faceAt(chart->faces[i]));
+    }
+
+    return centerSum / float(count);
+}
+
+
+// Keeps creating random charts until facesLeft reaches zero.
+void AtlasBuilder::fillHoles(float threshold)
+{
+    for (; facesLeft > 0; ) {
+        createRandomChart(threshold);
+    }
+}
+
+
+// Reassigns every face of `chart` to `owner` and folds chart's totals into owner's; the shared
+// boundary is subtracted from the combined boundary length. `chart` itself is not modified.
+void AtlasBuilder::mergeChart(ChartBuildData * owner, ChartBuildData * chart, float sharedBoundaryLength)
+{
+    const uint movedCount = chart->faces.count();
+    for (uint i = 0; i < movedCount; i++)
+    {
+        const uint f = chart->faces[i];
+        nvDebugCheck(faceChartArray[f] == chart->id);
+
+        faceChartArray[f] = owner->id;
+        owner->faces.append(f);
+    }
+
+    // Update adjacencies?
+
+    owner->area += chart->area;
+    owner->boundaryLength += chart->boundaryLength - sharedBoundaryLength;
+    owner->normalSum += chart->normalSum;
+    owner->centroidSum += chart->centroidSum;
+
+    updateProxy(owner);
+}
+// Merges charts into a neighbor they share enough boundary with, then compacts chartArray and renumbers the charts.
+void AtlasBuilder::mergeCharts()
+{
+    Array<float> sharedBoundaryLengths;
+    // Charts are visited from last to first; a merged chart is deleted and its slot set to NULL.
+    const uint chartCount = chartArray.count();
+    for (int c = chartCount-1; c >= 0; c--)
+    {
+        sharedBoundaryLengths.clear();
+        sharedBoundaryLengths.resize(chartCount, 0.0f);
+
+        ChartBuildData * chart = chartArray[c];
+
+        float externalBoundary = 0.0f;
+        // Measure how much edge length chart c shares with each other chart.
+        const uint faceCount = chart->faces.count();
+        for (uint i = 0; i < faceCount; i++)
+        {
+            uint f = chart->faces[i];
+            const HalfEdge::Face * face = mesh->faceAt(f);
+
+            for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+            {
+                const HalfEdge::Edge * edge = it.current();
+
+                // Cached length, used instead of edge->length().
+                float l = edgeLengths[edge->id/2];
+
+                if (edge->isBoundary()) {
+                    externalBoundary += l;
+                }
+                else {
+                    uint neighborFace = edge->pair->face->id;
+                    uint neighborChart = faceChartArray[neighborFace];
+                    // Normal/texture seams and neighbors with chart id -2 count as external, not shared.
+                    if (neighborChart != c) {
+                        if ((edge->isSeam() && (isNormalSeam(edge) || isTextureSeam(edge))) || neighborChart == -2) {
+                            externalBoundary += l;
+                        }
+                        else {
+                            sharedBoundaryLengths[neighborChart] += l;
+                        }
+                    }
+                }
+            }
+        }
+        // Look for a chart to merge chart c into; the first match (from the last chart down) wins.
+        for (int cc = chartCount-1; cc >= 0; cc--)
+        {
+            if (cc == c) 
+                continue;
+
+            ChartBuildData * chart2 = chartArray[cc];
+            if (chart2 == NULL) 
+                continue;
+
+            if (sharedBoundaryLengths[cc] > 0.8 * max(0.0f, chart->boundaryLength - externalBoundary)) {
+                // More than 80% of the non-external boundary is shared with chart2.
+                // Try to avoid degenerate configurations.
+                if (chart2->boundaryLength > sharedBoundaryLengths[cc])
+                {
+                    if (dot(chart2->planeNormal, chart->planeNormal) > -0.25) {
+                        mergeChart(chart2, chart, sharedBoundaryLengths[cc]);
+                        delete chart;
+                        chartArray[c] = NULL;
+                        break;
+                    }
+                }
+            }
+
+            if (sharedBoundaryLengths[cc] > 0.20 * max(0.0f, chart->boundaryLength - externalBoundary)) {
+                // More than 20% is shared: merge only if the plane normals point the same way.
+                // Compare proxies.
+                if (dot(chart2->planeNormal, chart->planeNormal) > 0) {
+                    mergeChart(chart2, chart, sharedBoundaryLengths[cc]);
+                    delete chart;
+                    chartArray[c] = NULL;
+                    break;
+                }
+            }
+        }
+    }
+
+    // Remove deleted charts.
+    for (int c = 0; c < I32(chartArray.count()); /*do not increment if removed*/)
+    {
+        if (chartArray[c] == NULL) {
+            chartArray.removeAt(c);
+
+            // Update faceChartArray: chart ids above the removed slot move down by one.
+            const uint faceCount = faceChartArray.count();
+            for (uint i = 0; i < faceCount; i++) {
+                nvDebugCheck (faceChartArray[i] != -1);
+                nvDebugCheck (faceChartArray[i] != c);
+                nvDebugCheck (faceChartArray[i] <= I32(chartArray.count()));
+
+                if (faceChartArray[i] > c) {
+                    faceChartArray[i]--;
+                }
+            }
+        }
+        else {
+            chartArray[c]->id = c; // Keep each surviving chart's id equal to its index.
+            c++;
+        }
+    }
+}
+
+
+// Returns the face list of the chart stored at index i of chartArray.
+const Array<uint> & AtlasBuilder::chartFaces(uint i) const
+{
+    return chartArray[i]->faces;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.h b/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.h
new file mode 100644
index 0000000000..f25c724f7e
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/AtlasBuilder.h
@@ -0,0 +1,111 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MESH_ATLASBUILDER_H
+#define NV_MESH_ATLASBUILDER_H
+
+#include "Atlas.h"
+
+#include "nvmath/Vector.h"
+#include "nvmath/Random.h"
+#include "nvmesh/nvmesh.h"
+
+#include "nvcore/Array.h"
+#include "nvcore/BitArray.h"
+
+
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+
+    struct ChartBuildData;
+
+    struct AtlasBuilder   // State and operations for grouping the faces of a half-edge mesh into charts (chartArray).
+    {
+        AtlasBuilder(const HalfEdge::Mesh * m);
+        ~AtlasBuilder();
+
+        void markUnchartedFaces(const Array<uint> & unchartedFaces);
+
+        void computeShortestPaths();
+
+        void placeSeeds(float threshold, uint maxSeedCount);
+        void createRandomChart(float threshold);
+
+        void addFaceToChart(ChartBuildData * chart, uint f, bool recomputeProxy=false);
+
+        bool growCharts(float threshold, uint faceCount);
+        bool growChart(ChartBuildData * chart, float threshold, uint faceCount);
+
+        void resetCharts();
+
+        void updateCandidates(ChartBuildData * chart, uint face);
+
+        void updateProxies();
+        void updateProxy(ChartBuildData * chart);
+
+        bool relocateSeeds();
+        bool relocateSeed(ChartBuildData * chart);
+
+        void updatePriorities(ChartBuildData * chart);
+
+        float evaluatePriority(ChartBuildData * chart, uint face);
+        float evaluateProxyFitMetric(ChartBuildData * chart, uint face);
+        float evaluateDistanceToBoundary(ChartBuildData * chart, uint face);
+        float evaluateDistanceToSeed(ChartBuildData * chart, uint face);
+        float evaluateRoundnessMetric(ChartBuildData * chart, uint face, float newBoundaryLength, float newChartArea);
+        float evaluateStraightnessMetric(ChartBuildData * chart, uint face);
+
+        float evaluateNormalSeamMetric(ChartBuildData * chart, uint f);
+        float evaluateTextureSeamMetric(ChartBuildData * chart, uint f);
+        float evaluateSeamMetric(ChartBuildData * chart, uint f);
+
+        float evaluateChartArea(ChartBuildData * chart, uint f);
+        float evaluateBoundaryLength(ChartBuildData * chart, uint f);
+        Vector3 evaluateChartNormalSum(ChartBuildData * chart, uint f);
+        Vector3 evaluateChartCentroidSum(ChartBuildData * chart, uint f);
+
+        Vector3 computeChartCentroid(const ChartBuildData * chart);
+
+
+        void fillHoles(float threshold);
+        void mergeCharts();
+
+        // @@ Cleanup.
+        struct Candidate {   // One entry of candidateArray.
+            uint face;
+            ChartBuildData * chart;
+            float metric;
+        };
+
+        const Candidate & getBestCandidate() const;
+        void removeCandidate(uint f);
+        void updateCandidate(ChartBuildData * chart, uint f, float metric);
+
+        void mergeChart(ChartBuildData * owner, ChartBuildData * chart, float sharedBoundaryLength);   // mergeCharts() deletes 'chart' right after calling this.
+
+
+        uint chartCount() const { return chartArray.count(); }   // Number of entries in chartArray.
+        const Array<uint> & chartFaces(uint i) const;   // Faces of chart i (chartArray[i]->faces).
+
+        const HalfEdge::Mesh * mesh;
+        uint facesLeft;
+        Array<int> faceChartArray;   // Per-face chart index (signed; mergeCharts() debug-checks entries against -1).
+        Array<ChartBuildData *> chartArray;   // Raw pointers; mergeCharts() deletes merged charts and removes their slots.
+        Array<float> shortestPaths;
+
+        Array<float> edgeLengths;
+        Array<float> faceAreas;
+
+        Array<Candidate> candidateArray; // Candidate records (face, chart, metric).
+        Array<uint> faceCandidateArray; // Map face index to candidate index.
+
+        MTRand rand;
+
+        SegmentationSettings settings;
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_ATLASBUILDER_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.cpp b/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.cpp
new file mode 100644
index 0000000000..5ce452cb9e
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.cpp
@@ -0,0 +1,1387 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "AtlasPacker.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/param/Atlas.h"
+#include "nvmesh/param/Util.h"
+#include "nvmesh/raster/Raster.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/ConvexHull.h"
+#include "nvmath/Color.h"
+#include "nvmath/ftoi.h"
+
+#include "nvcore/StrLib.h" // debug
+#include "nvcore/StdStream.h" // fileOpen
+
+#include <float.h> // FLT_MAX
+#include <limits.h> // UINT_MAX
+
+using namespace nv;
+
+#define DEBUG_OUTPUT 0
+
+#if DEBUG_OUTPUT
+
+#include "nvimage/ImageIO.h"
+
+namespace
+{
+    const uint TGA_TYPE_GREY = 3;
+    const uint TGA_TYPE_RGB = 2;
+    const uint TGA_ORIGIN_UPPER = 0x20;
+
+#pragma pack(push, 1)
+    struct TgaHeader {   // TGA file header written verbatim with fwrite; must stay exactly Size (18) bytes, hence the surrounding pack(1).
+	    uint8	id_length;
+	    uint8	colormap_type;
+	    uint8	image_type;
+	    uint16	colormap_index;
+	    uint16	colormap_length;
+	    uint8	colormap_size;
+	    uint16	x_origin;
+	    uint16	y_origin;
+	    uint16	width;
+	    uint16	height;
+	    uint8	pixel_size;
+	    uint8	flags;
+
+	    enum { Size = 18 };		// Expected sizeof(TgaHeader); checked with nvStaticCheck by the writers.
+    };
+#pragma pack(pop)
+    // Debug helper: writes bits (0..w-1, 0..h-1) of 'bitmap' to fileName as an 8-bit greyscale TGA (set bit = 0xFF). Returns silently if the file cannot be opened.
+    static void outputDebugBitmap(const char * fileName, const BitMap & bitmap, int w, int h)
+    {
+        FILE * fp = fileOpen(fileName, "wb");
+        if (fp == NULL) return;
+        
+        nvStaticCheck(sizeof(TgaHeader) == TgaHeader::Size);
+	    TgaHeader tga;
+	    tga.id_length = 0;
+	    tga.colormap_type = 0;
+	    tga.image_type = TGA_TYPE_GREY;
+
+	    tga.colormap_index = 0;
+	    tga.colormap_length = 0;
+	    tga.colormap_size = 0;
+
+	    tga.x_origin = 0;
+	    tga.y_origin = 0;
+        tga.width = w;    // int narrowed to the header's uint16 field.
+        tga.height = h;
+	    tga.pixel_size = 8;
+	    tga.flags = TGA_ORIGIN_UPPER;
+
+        fwrite(&tga, sizeof(TgaHeader), 1, fp);
+
+		for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                uint8 color = bitmap.bitAt(i, j) ? 0xFF : 0x0;
+                fwrite(&color, 1, 1, fp);    // One byte per pixel, row by row.
+            }
+		}
+
+        fclose(fp);
+    }
+    // Debug helper: writes pixels (0..w-1, 0..h-1) of 'bitmap' to fileName as an uncompressed 24-bit TGA. Returns silently if the file cannot be opened.
+    static void outputDebugImage(const char * fileName, const Image & bitmap, int w, int h)
+    {
+        FILE * fp = fileOpen(fileName, "wb");
+        if (fp == NULL) return;
+        
+        nvStaticCheck(sizeof(TgaHeader) == TgaHeader::Size);
+	    TgaHeader tga;
+	    tga.id_length = 0;
+	    tga.colormap_type = 0;
+        tga.image_type = TGA_TYPE_RGB;
+
+	    tga.colormap_index = 0;
+	    tga.colormap_length = 0;
+	    tga.colormap_size = 0;
+
+	    tga.x_origin = 0;
+	    tga.y_origin = 0;
+        tga.width = w;    // int narrowed to the header's uint16 field.
+        tga.height = h;
+	    tga.pixel_size = 24;
+	    tga.flags = TGA_ORIGIN_UPPER;
+
+        fwrite(&tga, sizeof(TgaHeader), 1, fp);
+
+		for (int j = 0; j < h; j++) {
+            for (int i = 0; i < w; i++) {
+                Color32 color = bitmap.pixel(i, j);
+                fwrite(&color.b, 1, 1, fp);    // TGA true-color pixels are stored blue, green, red;
+                fwrite(&color.g, 1, 1, fp);    // writing r first would swap red and blue in viewers.
+                fwrite(&color.r, 1, 1, fp);
+            }
+		}
+
+        fclose(fp);
+    }
+}
+
+#endif // DEBUG_OUTPUT
+// Rounds x up to the next multiple of a. The mask form used below is only correct when a is a power of two.
+inline int align(int x, int a) {
+    //return a * ((x + a - 1) / a);
+    //return (x + a - 1) & -a;
+    return (x + a - 1) & ~(a - 1);
+}
+// True when the low bits of x selected by (a - 1) are all zero, i.e. x is a multiple of a for power-of-two a.
+inline bool isAligned(int x, int a) {
+    return (x & (a - 1)) == 0;
+}
+
+
+// Stores the atlas pointer, starts m_bitmap at 256x256, zeroes the packed size, and allocates a cleared 256x256 debug image.
+AtlasPacker::AtlasPacker(Atlas * atlas) : m_atlas(atlas), m_bitmap(256, 256)
+{
+    m_width = 0;
+    m_height = 0;
+    
+    m_debug_bitmap.allocate(256, 256);
+    m_debug_bitmap.fill(Color32(0,0,0,0));
+}
+// Empty body: nothing is released explicitly here (m_atlas is not deleted).
+AtlasPacker::~AtlasPacker()
+{
+}
+
+// Finds a small oriented bounding box of the chart's tex coords by brute force over convex-hull edges (rotating calipers would be better). Outputs the box axes and its min/max corners in that rotated frame; returns false when the chart has no boundary vertices.
+static bool computeBoundingBox(Chart * chart, Vector2 * majorAxis, Vector2 * minorAxis, Vector2 * minCorner, Vector2 * maxCorner)
+{
+    // Compute list of boundary points.
+    Array<Vector2> points(16);
+
+    HalfEdge::Mesh * mesh = chart->chartMesh();
+    const uint vertexCount = mesh->vertexCount();
+
+    for (uint i = 0; i < vertexCount; i++) {
+        HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+        if (vertex->isBoundary()) {
+            points.append(vertex->tex);
+        }
+    }
+
+    // This is not valid anymore. The chart mesh may have multiple boundaries!
+    /*const HalfEdge::Vertex * vertex = findBoundaryVertex(chart->chartMesh());
+
+    // Traverse boundary.
+    const HalfEdge::Edge * const firstEdge = vertex->edge();
+    const HalfEdge::Edge * edge = firstEdge;
+    do {
+        vertex = edge->vertex();
+
+        nvDebugCheck (vertex->isBoundary());
+        points.append(vertex->tex);
+
+        edge = edge->next();
+    } while (edge != firstEdge);*/
+
+#if 1
+    Array<Vector2> hull;
+    if (points.size()==0) {    // No boundary vertices: nothing to build a hull from.
+        return false;    
+    }
+    
+    convexHull(points, hull, 0.00001f);
+
+    // @@ Ideally I should use rotating calipers to find the best box. Using brute force for now.
+
+    float best_area = FLT_MAX;
+    Vector2 best_min(FLT_MAX, FLT_MAX);      // Explicit defaults: if every hull edge below is degenerate (e.g. a single
+    Vector2 best_max(-FLT_MAX, -FLT_MAX);    // boundary point) nothing is assigned in the loop, and the all-vertices pass
+    Vector2 best_axis(1.0f, 0.0f);           // further down then produces an axis-aligned box instead of reading unset values.
+
+    const uint hullCount = hull.count();
+    for (uint i = 0, j = hullCount-1; i < hullCount; j = i, i++) {
+
+        if (equal(hull[i], hull[j])) {    // Zero-length edge gives no direction; skip it.
+            continue;
+        }
+
+        Vector2 axis = normalize(hull[i] - hull[j], 0.0f);
+        nvDebugCheck(isFinite(axis));
+
+        // Compute bounding box.
+        Vector2 box_min(FLT_MAX, FLT_MAX);
+        Vector2 box_max(-FLT_MAX, -FLT_MAX);
+
+        for (uint v = 0; v < hullCount; v++) {
+
+           Vector2 point = hull[v];
+
+           float x = dot(axis, point);
+           if (x < box_min.x) box_min.x = x;
+           if (x > box_max.x) box_max.x = x;
+
+           float y = dot(Vector2(-axis.y, axis.x), point);
+           if (y < box_min.y) box_min.y = y;
+           if (y > box_max.y) box_max.y = y;
+        }
+    
+        // Compute box area.
+        float area = (box_max.x - box_min.x) * (box_max.y - box_min.y);
+
+        if (area < best_area) {
+            best_area = area;
+            best_min = box_min;
+            best_max = box_max;
+            best_axis = axis;
+        }
+    }
+
+    // Make sure the box contains all the input points since the convex hull is not 100% accurate.
+    /*const uint pointCount = points.count();
+    for (uint v = 0; v < pointCount; v++) {
+
+        Vector2 point = points[v];
+
+        float x = dot(best_axis, point);
+        if (x < best_min.x) best_min.x = x;
+
+        float y = dot(Vector2(-best_axis.y, best_axis.x), point);
+        if (y < best_min.y) best_min.y = y;
+    }*/
+
+    // Consider all points, not only boundary points, in case the input chart is malformed.
+    for (uint i = 0; i < vertexCount; i++) {
+        HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+        Vector2 point = vertex->tex;
+
+        float x = dot(best_axis, point);
+        if (x < best_min.x) best_min.x = x;
+        if (x > best_max.x) best_max.x = x;
+
+        float y = dot(Vector2(-best_axis.y, best_axis.x), point);
+        if (y < best_min.y) best_min.y = y;
+        if (y > best_max.y) best_max.y = y;
+    }
+
+    *majorAxis = best_axis;
+    *minorAxis = Vector2(-best_axis.y, best_axis.x);
+    *minCorner = best_min;
+    *maxCorner = best_max;
+
+#else
+    // Approximate implementation: try 16 different directions and keep the best.
+
+    const uint N = 16;
+    Vector2 axis[N];
+
+    float minAngle = 0;
+    float maxAngle = PI / 2;
+
+    int best;
+    Vector2 mins[N];
+    Vector2 maxs[N];
+
+    const int iterationCount = 1;
+    for (int j = 0; j < iterationCount; j++)
+    {
+        // Init predefined directions.
+        for (int i = 0; i < N; i++)
+        {
+            float angle = lerp(minAngle, maxAngle, float(i)/N);
+            axis[i].set(cosf(angle), sinf(angle));
+        }
+
+        // Compute box for each direction.
+        for (int i = 0; i < N; i++)
+        {
+            mins[i].set(FLT_MAX, FLT_MAX);
+            maxs[i].set(-FLT_MAX, -FLT_MAX);
+        }
+
+        for (uint p = 0; p < points.count(); p++)
+        {
+            Vector2 point = points[p];
+
+            for (int i = 0; i < N; i++)
+            {
+               float x = dot(axis[i], point);
+               if (x < mins[i].x) mins[i].x = x;
+               if (x > maxs[i].x) maxs[i].x = x;
+
+               float y = dot(Vector2(-axis[i].y, axis[i].x), point);
+               if (y < mins[i].y) mins[i].y = y;
+               if (y > maxs[i].y) maxs[i].y = y;
+            }
+        }
+
+        // Find box with minimum area.
+        best = -1;
+        int second_best = -1;
+        float best_area = FLT_MAX;
+        float second_best_area = FLT_MAX;
+        
+        for (int i = 0; i < N; i++)
+        {
+            float area = (maxs[i].x - mins[i].x) * (maxs[i].y - mins[i].y);
+
+            if (area < best_area)
+            {
+                second_best_area = best_area;
+                second_best = best;
+
+                best_area = area;
+                best = i;
+            }
+            else if (area < second_best_area)
+            {
+                second_best_area = area;
+                second_best = i;
+            }
+        }
+        nvDebugCheck(best != -1);
+        nvDebugCheck(second_best != -1);
+        nvDebugCheck(best != second_best);
+
+        if (j != iterationCount-1)
+        {
+            // Handle wrap-around during the first iteration.
+            if (j == 0) {
+                if (best == 0 && second_best == N-1) best = N;
+                if (best == N-1 && second_best == 0) second_best = N;
+            }
+
+            if (best < second_best) swap(best, second_best);
+
+            // Update angles.
+            float deltaAngle = (maxAngle - minAngle) / N;
+            maxAngle = minAngle + (best - 0.5f)  * deltaAngle;
+            minAngle = minAngle + (second_best + 0.5f) * deltaAngle;
+        }
+    }
+
+    // Compute major and minor axis, and origin.
+    *majorAxis = axis[best];
+    *minorAxis = Vector2(-axis[best].y, axis[best].x);
+    *origin = mins[best];
+
+    // @@ If the parameterization is invalid, we could have an interior vertex outside the boundary.
+    // @@ In that case the returned bounding box would be incorrect. Compute updated bounds here.
+    /*for (uint p = 0; p < points.count(); p++)
+    {
+        Vector2 point = points[p];
+
+        for (int i = 0; i < N; i++)
+        {
+           float x = dot(*majorAxis, point);
+           float y = dot(*minorAxis, point);
+        }
+    }*/
+#endif
+
+    return true;
+}
+
+// Packs every chart of m_atlas: rescales each chart's tex coords to texel units, orders charts by extents.x + extents.y (largest first), places each one via findChartLocation(), and stores the final 4-aligned size in m_width/m_height. Calls m_atlas->setFailed() and returns early if a chart's bounding box cannot be computed.
+void AtlasPacker::packCharts(int quality, float texelsPerUnit, bool blockAligned, bool conservative)
+{
+    const uint chartCount = m_atlas->chartCount();
+    if (chartCount == 0) return;
+
+    Array<float> chartOrderArray;
+    chartOrderArray.resize(chartCount);
+
+    Array<Vector2> chartExtents;
+    chartExtents.resize(chartCount);
+    
+    float meshArea = 0;
+    for (uint c = 0; c < chartCount; c++)
+    {
+        Chart * chart = m_atlas->chartAt(c);
+        
+        if (!chart->isVertexMapped() && !chart->isDisk()) {
+            chartOrderArray[c] = 0;
+
+            // Skip non-disks.
+            continue;
+        }
+
+        Vector2 extents(0.0f);
+
+        if (chart->isVertexMapped()) {
+            // Let's assume vertex maps are arranged in a rectangle.
+            //HalfEdge::Mesh * mesh = chart->chartMesh();
+
+            // Arrange vertices in a rectangle.
+            extents.x = float(chart->vertexMapWidth);
+            extents.y = float(chart->vertexMapHeight);
+        }
+        else {
+            // Compute surface area to sort charts.
+            float chartArea = chart->computeSurfaceArea();
+            meshArea += chartArea;
+            //chartOrderArray[c] = chartArea;
+
+            // Compute chart scale
+            float parametricArea = fabs(chart->computeParametricArea());    // @@ There doesn't seem to be anything preventing parametric area to be negative.
+            if (parametricArea < NV_EPSILON) {
+                // When the parametric area is too small we use a rough approximation to prevent divisions by very small numbers.
+                Vector2 bounds = chart->computeParametricBounds();
+                parametricArea = bounds.x * bounds.y;
+            }
+            float scale = (chartArea / parametricArea) * texelsPerUnit;    // Not finite when parametricArea is 0; replaced by 0 just below.
+            if (parametricArea == 0) // < NV_EPSILON)
+            {
+                scale = 0;
+            }
+            nvCheck(isFinite(scale));
+
+            // Compute bounding box of chart.
+            Vector2 majorAxis, minorAxis, origin, end;
+            if (!computeBoundingBox(chart, &majorAxis, &minorAxis, &origin, &end)) {
+                m_atlas->setFailed();
+                return;
+            }
+
+            nvCheck(isFinite(majorAxis) && isFinite(minorAxis) && isFinite(origin));
+            
+            // Sort charts by perimeter. @@ This is sometimes producing somewhat unexpected results. Is this right?
+            //chartOrderArray[c] = ((end.x - origin.x) + (end.y - origin.y)) * scale;
+
+            // Translate, rotate and scale vertices. Compute extents.
+            HalfEdge::Mesh * mesh = chart->chartMesh();
+            const uint vertexCount = mesh->vertexCount();
+            for (uint i = 0; i < vertexCount; i++)
+            {
+                HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+
+                //Vector2 t = vertex->tex - origin;
+                Vector2 tmp;
+                tmp.x = dot(vertex->tex, majorAxis);
+                tmp.y = dot(vertex->tex, minorAxis);
+                tmp -= origin;
+                tmp *= scale;
+                if (tmp.x < 0 || tmp.y < 0) {
+                    nvDebug("tmp: %f %f\n", tmp.x, tmp.y);
+                    nvDebug("scale: %f\n", scale);
+                    nvDebug("origin: %f %f\n", origin.x, origin.y);
+                    nvDebug("majorAxis: %f %f\n", majorAxis.x, majorAxis.y);
+                    nvDebug("minorAxis: %f %f\n", minorAxis.x, minorAxis.y);
+                    nvDebugBreak();
+                }
+                //nvCheck(tmp.x >= 0 && tmp.y >= 0);
+
+                vertex->tex = tmp;
+
+				nvCheck(isFinite(vertex->tex.x) && isFinite(vertex->tex.y));
+
+                extents = max(extents, tmp);
+            }
+            nvDebugCheck(extents.x >= 0 && extents.y >= 0);
+
+            // Limit chart size.
+            if (extents.x > 1024 || extents.y > 1024) {
+                float limit = max(extents.x, extents.y);
+
+                scale = 1024 / (limit + 1);
+
+                for (uint i = 0; i < vertexCount; i++)
+                {
+                    HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+                    vertex->tex *= scale;
+                }
+
+                extents *= scale;
+
+                nvDebugCheck(extents.x <= 1024 && extents.y <= 1024);
+            }
+
+
+            // Scale the charts to use the entire texel area available. So, if the width is 0.1 we could scale it to 1 without increasing the lightmap usage and making a better 
+            // use of it. In many cases this also improves the look of the seams, since vertices on the chart boundaries have more chances of being aligned with the texel centers.
+
+            float scale_x = 1.0f;
+            float scale_y = 1.0f;
+
+            float divide_x = 1.0f;
+            float divide_y = 1.0f;
+
+            if (extents.x > 0) {
+                int cw = ftoi_ceil(extents.x);
+
+                if (blockAligned) {
+                    // Align all chart extents to 4x4 blocks, but taking padding into account.
+                    if (conservative) {
+                        cw = align(cw + 2, 4) - 2;
+                    }
+                    else {
+                        cw = align(cw + 1, 4) - 1;
+                    }
+                }
+
+                scale_x = (float(cw) - NV_EPSILON);
+                divide_x = extents.x;
+                extents.x = float(cw);
+            }
+
+            if (extents.y > 0) {
+                int ch = ftoi_ceil(extents.y);
+
+                if (blockAligned) {
+                    // Align all chart extents to 4x4 blocks, but taking padding into account.
+                    if (conservative) {
+                        ch = align(ch + 2, 4) - 2;
+                    }
+                    else {
+                        ch = align(ch + 1, 4) - 1;
+                    }
+                }
+
+                scale_y = (float(ch) - NV_EPSILON);
+                divide_y = extents.y;
+                extents.y = float(ch);
+            }
+
+            for (uint v = 0; v < vertexCount; v++) {
+                HalfEdge::Vertex * vertex = mesh->vertexAt(v);
+
+                vertex->tex.x /= divide_x;
+                vertex->tex.y /= divide_y;
+                vertex->tex.x *= scale_x;
+                vertex->tex.y *= scale_y;
+
+				nvCheck(isFinite(vertex->tex.x) && isFinite(vertex->tex.y));
+            }
+        }
+
+        chartExtents[c] = extents;
+
+        // Sort charts by perimeter.
+        chartOrderArray[c] = extents.x + extents.y;
+    }
+
+    // @@ We can try to improve compression of small charts by sorting them by proximity like we do with vertex samples.
+    // @@ How to do that? One idea: compute chart centroid, insert into grid, compute morton index of the cell, sort based on morton index.
+    // @@ We would sort by morton index, first, then quantize the chart sizes, so that all small charts have the same size, and sort by size preserving the morton order.
+
+    //nvDebug("Sorting charts.\n");
+
+    // Sort charts by the order key computed above (extents.x + extents.y), not by area.
+    m_radix.sort(chartOrderArray);
+    const uint32 * ranks = m_radix.ranks();
+
+    // Estimate size of the map based on the mesh surface area and given texel scale.
+    float texelCount = meshArea * square(texelsPerUnit) / 0.75f; // Assume 75% utilization.
+    if (texelCount < 1) texelCount = 1;
+    uint approximateExtent = nextPowerOfTwo(uint(sqrtf(texelCount)));
+
+    //nvDebug("Init bitmap.\n");
+
+    // @@ Pack all charts smaller than a texel into a compact rectangle.
+    // @@ Start considering only 1x1 charts. Extend to 1xn charts later.
+
+    /*for (uint i = 0; i < chartCount; i++)
+    {
+        uint c = ranks[chartCount - i - 1]; // largest chart first
+
+        Chart * chart = m_atlas->chartAt(c);
+
+        if (!chart->isDisk()) continue;
+
+        if (iceil(chartExtents[c].x) == 1 && iceil(chartExtents[c].y) == 1) {
+            // @@ Add to 
+        }
+    }*/
+
+
+
+    // Init bit map.
+    m_bitmap.clearAll();
+    if (approximateExtent > m_bitmap.width()) {
+        m_bitmap.resize(approximateExtent, approximateExtent, false);
+        m_debug_bitmap.resize(approximateExtent, approximateExtent);
+        m_debug_bitmap.fill(Color32(0,0,0,0));
+    }
+
+    
+    int w = 0;
+    int h = 0;
+
+#if 1
+    // Add sorted charts to bitmap.
+    for (uint i = 0; i < chartCount; i++)
+    {
+        uint c = ranks[chartCount - i - 1]; // largest chart first
+
+        Chart * chart = m_atlas->chartAt(c);
+
+        if (!chart->isVertexMapped() && !chart->isDisk()) continue;
+
+        //float scale_x = 1;
+        //float scale_y = 1;
+
+        BitMap chart_bitmap;
+
+        if (chart->isVertexMapped()) {
+            // Init all bits to 1.
+            chart_bitmap.resize(ftoi_ceil(chartExtents[c].x), ftoi_ceil(chartExtents[c].y), /*initValue=*/true);
+
+            // @@ Another alternative would be to try to map each vertex to a different texel trying to fill all the available unused texels.
+        }
+        else {
+            // @@ Add special cases for dot and line charts. @@ Lightmap rasterizer also needs to handle these special cases.
+            // @@ We could also have a special case for chart quads. If the quad surface <= 4 texels, align vertices with texel centers and do not add padding. May be very useful for foliage.
+
+            // @@ In general we could reduce the padding of all charts by one texel by using a rasterizer that takes into account the 2-texel footprint of the tent bilinear filter. For example,
+            // if we have a chart that is less than 1 texel wide currently we add one texel to the left and one texel to the right creating a 3-texel-wide bitmap. However, if we know that the 
+            // chart is only 1 texel wide we could align it so that it only touches the footprint of two texels:
+
+            //      |   |      <- Touches texels 0, 1 and 2.
+            //    |   |        <- Only touches texels 0 and 1.
+            // \   \ / \ /   /
+            //  \   X   X   /
+            //   \ / \ / \ /
+            //    V   V   V
+            //    0   1   2
+
+            if (conservative) {
+                // Init all bits to 0.
+                chart_bitmap.resize(ftoi_ceil(chartExtents[c].x) + 2, ftoi_ceil(chartExtents[c].y) + 2, /*initValue=*/false);  // + 2 to add padding on both sides.
+
+                // Rasterize chart and dilate.
+                drawChartBitmapDilate(chart, &chart_bitmap, /*padding=*/1);
+            }
+            else {
+                // Init all bits to 0.
+                chart_bitmap.resize(ftoi_ceil(chartExtents[c].x) + 1, ftoi_ceil(chartExtents[c].y) + 1, /*initValue=*/false);  // Add half a texel on each side.
+
+                // Rasterize chart with a half-texel offset (this path calls drawChartBitmap, not the dilating variant).
+                drawChartBitmap(chart, &chart_bitmap, Vector2(1), Vector2(0.5));
+            }
+        }
+
+        int best_x, best_y;
+        int best_cw, best_ch;   // Includes padding now.
+        int best_r;
+        findChartLocation(quality, &chart_bitmap, chartExtents[c], w, h, &best_x, &best_y, &best_cw, &best_ch, &best_r);
+        
+        /*if (w < best_x + best_cw || h < best_y + best_ch)
+        {
+            nvDebug("Resize extents to (%d, %d).\n", best_x + best_cw, best_y + best_ch);
+        }*/
+
+        // Update parametric extents.
+        w = max(w, best_x + best_cw);
+        h = max(h, best_y + best_ch);
+        
+        w = align(w, 4);
+        h = align(h, 4);
+
+        // Resize bitmap if necessary.
+        if (uint(w) > m_bitmap.width() || uint(h) > m_bitmap.height())
+        {
+            //nvDebug("Resize bitmap (%d, %d).\n", nextPowerOfTwo(w), nextPowerOfTwo(h));
+            m_bitmap.resize(nextPowerOfTwo(U32(w)), nextPowerOfTwo(U32(h)), false);
+            m_debug_bitmap.resize(nextPowerOfTwo(U32(w)), nextPowerOfTwo(U32(h)));
+        }
+
+        //nvDebug("Add chart at (%d, %d).\n", best_x, best_y);
+
+        addChart(&chart_bitmap, w, h, best_x, best_y, best_r, /*debugOutput=*/NULL);
+
+        // IC: Output chart again to debug bitmap.
+        if (chart->isVertexMapped()) {
+            addChart(&chart_bitmap, w, h, best_x, best_y, best_r, &m_debug_bitmap);
+        }
+        else {
+            addChart(chart, w, h, best_x, best_y, best_r, &m_debug_bitmap);
+        }
+
+        //float best_angle = 2 * PI * best_r;
+
+        // Translate and rotate chart texture coordinates.
+        HalfEdge::Mesh * mesh = chart->chartMesh();
+        const uint vertexCount = mesh->vertexCount();
+        for (uint v = 0; v < vertexCount; v++)
+        {
+            HalfEdge::Vertex * vertex = mesh->vertexAt(v);
+
+            Vector2 t = vertex->tex;
+            if (best_r) swap(t.x, t.y);    // A non-zero best_r is applied as an x/y swap of the coordinates.
+            //vertex->tex.x = best_x + t.x * cosf(best_angle) - t.y * sinf(best_angle);
+            //vertex->tex.y = best_y + t.x * sinf(best_angle) + t.y * cosf(best_angle);
+
+            vertex->tex.x = best_x + t.x + 0.5f;
+            vertex->tex.y = best_y + t.y + 0.5f;
+
+            nvCheck(vertex->tex.x >= 0 && vertex->tex.y >= 0);
+			nvCheck(isFinite(vertex->tex.x) && isFinite(vertex->tex.y));
+        }
+
+#if DEBUG_OUTPUT && 0
+        StringBuilder fileName;
+        fileName.format("debug_packer_%d.tga", i);
+        //outputDebugBitmap(fileName.str(), m_bitmap, w, h);
+        outputDebugImage(fileName.str(), m_debug_bitmap, w, h);
+#endif
+    }
+
+#else // 0
+
+    // Add sorted charts to bitmap.
+    for (uint i = 0; i < chartCount; i++)
+    {
+        uint c = ranks[chartCount - i - 1]; // largest chart first
+
+        Chart * chart = m_atlas->chartAt(c);
+
+        if (!chart->isDisk()) continue;
+
+        Vector2 scale(1, 1);
+
+#if 0 // old method.
+        //m_padding_x = 2*padding;
+        //m_padding_y = 2*padding;
+#else
+        //m_padding_x = 0; //padding;
+        //m_padding_y = 0; //padding;
+#endif
+
+        int bw = ftoi_ceil(chartExtents[c].x + 1);
+        int bh = ftoi_ceil(chartExtents[c].y + 1);
+
+        if (chartExtents[c].x < 1.0f) {
+            scale.x = 0.01f; // @@ Ideally we would like to scale it to 0, but then our rasterizer would not touch any pixels.
+            bw = 1;
+        }
+        if (chartExtents[c].y < 1.0f) {
+            scale.y = 0.01f;
+            bh = 1;
+        }
+
+        //BitMap chart_bitmap(iceil(chartExtents[c].x) + 1 + m_padding_x * 2, iceil(chartExtents[c].y) + 1 + m_padding_y * 2);
+        //BitMap chart_bitmap(ftoi_ceil(chartExtents[c].x/2)*2, ftoi_ceil(chartExtents[c].y/2)*2);
+        BitMap chart_bitmap(bw, bh);
+        chart_bitmap.clearAll();
+        
+        Vector2 offset;
+        offset.x = 0; // (chart_bitmap.width() - chartExtents[c].x) * 0.5f;
+        offset.y = 0; // (chart_bitmap.height() - chartExtents[c].y) * 0.5f;
+
+        drawChartBitmap(chart, &chart_bitmap, scale, offset);
+
+        int best_x, best_y;
+        int best_cw, best_ch;
+        int best_r;
+        findChartLocation(quality, &chart_bitmap, chartExtents[c], w, h, &best_x, &best_y, &best_cw, &best_ch, &best_r);
+
+        /*if (w < best_x + best_cw || h < best_y + best_ch)
+        {
+            nvDebug("Resize extents to (%d, %d).\n", best_x + best_cw, best_y + best_ch);
+        }*/
+
+        // Update parametric extents.
+        w = max(w, best_x + best_cw);
+        h = max(h, best_y + best_ch);
+
+        // Resize bitmap if necessary.
+        if (uint(w) > m_bitmap.width() || uint(h) > m_bitmap.height())
+        {
+            //nvDebug("Resize bitmap (%d, %d).\n", nextPowerOfTwo(w), nextPowerOfTwo(h));
+            m_bitmap.resize(nextPowerOfTwo(w), nextPowerOfTwo(h), false);
+            m_debug_bitmap.resize(nextPowerOfTwo(w), nextPowerOfTwo(h));
+        }
+
+        //nvDebug("Add chart at (%d, %d).\n", best_x, best_y);
+
+#if 0 // old method.
+#if _DEBUG
+        checkCanAddChart(chart, w, h, best_x, best_y, best_r);
+#endif
+
+        // Add chart.
+        addChart(chart, w, h, best_x, best_y, best_r);
+#else
+        // Add chart reusing its bitmap.
+        addChart(&chart_bitmap, w, h, best_x, best_y, best_r);
+#endif
+
+        //float best_angle = 2 * PI * best_r;
+
+        // Translate and rotate chart texture coordinates.
+        HalfEdge::Mesh * mesh = chart->chartMesh();
+        const uint vertexCount = mesh->vertexCount();
+        for (uint v = 0; v < vertexCount; v++)
+        {
+            HalfEdge::Vertex * vertex = mesh->vertexAt(v);
+
+            Vector2 t = vertex->tex * scale + offset;
+            if (best_r) swap(t.x, t.y);
+            //vertex->tex.x = best_x + t.x * cosf(best_angle) - t.y * sinf(best_angle);
+            //vertex->tex.y = best_y + t.x * sinf(best_angle) + t.y * cosf(best_angle);
+            vertex->tex.x = best_x + t.x + 0.5f;
+            vertex->tex.y = best_y + t.y + 0.5f;
+
+            nvCheck(vertex->tex.x >= 0 && vertex->tex.y >= 0);
+        }
+
+#if DEBUG_OUTPUT && 0
+        StringBuilder fileName;
+        fileName.format("debug_packer_%d.tga", i);
+        //outputDebugBitmap(fileName.str(), m_bitmap, w, h);
+        outputDebugImage(fileName.str(), m_debug_bitmap, w, h);
+#endif
+    }
+
+#endif // 0
+
+    //w -= padding - 1; // Leave one pixel border!
+    //h -= padding - 1;
+
+    m_width = max(0, w);
+    m_height = max(0, h);
+
+    nvCheck(isAligned(m_width, 4));
+    nvCheck(isAligned(m_height, 4));
+
+    m_debug_bitmap.resize(m_width, m_height);
+    m_debug_bitmap.setFormat(Image::Format_ARGB);
+
+#if DEBUG_OUTPUT
+    //outputDebugBitmap("debug_packer_final.tga", m_bitmap, w, h);
+    //outputDebugImage("debug_packer_final.tga", m_debug_bitmap, w, h);
+    ImageIO::save("debug_packer_final.tga", &m_debug_bitmap);
+#endif
+}
+
+
+// IC: Brute force is slow, and random may take too much time to converge. We start inserting large charts in a small atlas. Using brute force is lame, because most of the space
+// is occupied at this point. At the end we have many small charts and a large atlas with sparse holes. Finding those holes randomly is slow. A better approach would be to
+// start stacking large charts as if they were tetris pieces. Once charts get small, try to place them randomly. It may be interesting to try an intermediate strategy: first try
+// along one axis and then try exhaustively along that axis.
+void AtlasPacker::findChartLocation(int quality, const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r)
+{
+    // Random-trial budget for each quality level. Any level not listed
+    // (including 0) uses the default of 256.
+    int trialBudget;
+    switch (quality)
+    {
+        case 1:  trialBudget = 4096; break;
+        case 2:  trialBudget = 2048; break;
+        case 3:  trialBudget = 1024; break;
+        case 4:  trialBudget = 512;  break;
+        default: trialBudget = 256;  break;
+    }
+
+    // Quality 0 always searches exhaustively; so does any atlas that has
+    // fewer texels than the random search would spend trials on.
+    const bool exhaustive = (quality == 0) || (w*h < trialBudget);
+
+    if (!exhaustive)
+    {
+        findChartLocation_random(bitmap, extents, w, h, best_x, best_y, best_w, best_h, best_r, trialBudget);
+        return;
+    }
+
+    findChartLocation_bruteForce(bitmap, extents, w, h, best_x, best_y, best_w, best_h, best_r);
+}
+
+#define BLOCK_SIZE 4
+
+// Exhaustively scans every BLOCK_SIZE-aligned position, in both orientations,
+// and writes the cheapest valid placement to the best_* outputs. The cost of a
+// placement is (longest atlas side)^2 + (atlas area) after adding the chart.
+// The 'extents' argument is not used by this search.
+void AtlasPacker::findChartLocation_bruteForce(const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r)
+{
+    int lowestCost = INT_MAX;
+
+    // Set once a placement is found that does not grow the atlas; the whole
+    // search stops at that point.
+    bool fitsInside = false;
+
+    // Try two different orientations.
+    for (int rot = 0; rot < 2 && !fitsInside; rot++)
+    {
+        int chartW = bitmap->width();
+        int chartH = bitmap->height();
+        if (rot & 1) swap(chartW, chartH);
+
+        // Scanning up to (and including) h + 1 / w + 1 lets the chart be
+        // placed past the current border, extending a full atlas.
+        for (int py = 0; py <= h + 1 && !fitsInside; py += BLOCK_SIZE)
+        {
+            for (int px = 0; px <= w + 1; px += BLOCK_SIZE)
+            {
+                // Atlas dimensions that would result from this placement.
+                const int grownW = max(w, px + chartW);
+                const int grownH = max(h, py + chartH);
+
+                const int area = grownW * grownH;
+                const int longestSide = max(grownW, grownH);
+                const int cost = longestSide*longestSide + area;
+
+                // Early out: cannot beat the current best.
+                if (cost > lowestCost) continue;
+
+                // If the cost is the same, keep the one closest to the origin.
+                if (cost == lowestCost && max(px, py) >= max(*best_x, *best_y)) continue;
+
+                if (!canAddChart(bitmap, w, h, px, py, rot)) continue;
+
+                lowestCost = cost;
+                *best_x = px;
+                *best_y = py;
+                *best_w = chartW;
+                *best_h = chartH;
+                *best_r = rot;
+
+                if (area == w*h)
+                {
+                    // Chart is completely inside, do not look at any other location.
+                    fitsInside = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    nvDebugCheck (lowestCost != INT_MAX);
+}
+
+
+// Samples random BLOCK_SIZE-aligned placements and writes the cheapest valid
+// one to the best_* outputs. At least minTrialCount samples are taken, and
+// sampling continues beyond that until one valid placement has been found.
+// The 'extents' argument is not used by this search.
+void AtlasPacker::findChartLocation_random(const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r, int minTrialCount)
+{
+    int lowestCost = INT_MAX;
+
+    for (int trial = 0; trial < minTrialCount || lowestCost == INT_MAX; trial++)
+    {
+        // The order of these three draws fixes the random sequence; keep it.
+        // The + 1 allows positions past the border so a full atlas can grow.
+        const int rot = m_rand.getRange(1);
+        int px = m_rand.getRange(w + 1);
+        int py = m_rand.getRange(h + 1);
+
+        px = align(px, BLOCK_SIZE);
+        py = align(py, BLOCK_SIZE);
+
+        int chartW = bitmap->width();
+        int chartH = bitmap->height();
+        if (rot & 1) swap(chartW, chartH);
+
+        // Atlas dimensions that would result from this placement.
+        const int grownW = max(w, px + chartW);
+        const int grownH = max(h, py + chartH);
+
+        const int area = grownW * grownH;
+        const int longestSide = max(grownW, grownH);
+        const int cost = longestSide*longestSide + area;
+
+        // Early out: cannot beat the current best.
+        if (cost > lowestCost) continue;
+
+        // If the cost is the same, keep the one closest to the origin.
+        if (cost == lowestCost && min(px, py) > min(*best_x, *best_y)) continue;
+
+        if (!canAddChart(bitmap, w, h, px, py, rot)) continue;
+
+        lowestCost = cost;
+        *best_x = px;
+        *best_y = py;
+        *best_w = chartW;
+        *best_h = chartH;
+        *best_r = rot;
+
+        if (area == w*h)
+        {
+            // Chart is completely inside, do not look at any other location.
+            break;
+        }
+    }
+}
+
+
+// Rasterizes the faces of 'chart' into 'bitmap' (offset by half a texel plus
+// 'padding') and then grows the set region by one texel in all eight
+// directions, 'padding' times.
+void AtlasPacker::drawChartBitmapDilate(const Chart * chart, BitMap * bitmap, int padding)
+{
+    const int width = bitmap->width();
+    const int height = bitmap->height();
+    const Vector2 extents = Vector2(float(width), float(height));
+
+    const Vector2 halfTexel = Vector2(0.5);
+    const Vector2 border = Vector2(float(padding), float(padding));
+
+    // Rasterize chart faces, setting every covered bit.
+    const uint faceCount = chart->faceCount();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = chart->chartMesh()->faceAt(f);
+
+        // Only the first four corners are stored; edgeCount still counts all of them.
+        Vector2 vertices[4];
+        uint edgeCount = 0;
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance(), edgeCount++)
+        {
+            if (edgeCount >= 4) continue;
+            vertices[edgeCount] = it.vertex()->tex + halfTexel + border;
+        }
+
+        if (edgeCount != 3)
+        {
+            Raster::drawQuad(Raster::Mode_Antialiased, extents, true, vertices, AtlasPacker::setBitsCallback, bitmap);
+        }
+        else
+        {
+            Raster::drawTriangle(Raster::Mode_Antialiased, extents, true, vertices, AtlasPacker::setBitsCallback, bitmap);
+        }
+    }
+
+    // Expand chart by padding pixels. (dilation)
+    BitMap scratch(width, height);
+    for (int pass = 0; pass < padding; pass++)
+    {
+        scratch.clearAll();
+
+        for (int y = 0; y < height; y++)
+        {
+            for (int x = 0; x < width; x++)
+            {
+                // A texel is set in the output if it, or any in-bounds
+                // neighbour of its 3x3 window, is set in the input.
+                bool filled = bitmap->bitAt(x, y);
+
+                for (int dy = -1; dy <= 1 && !filled; dy++)
+                {
+                    const int ny = y + dy;
+                    if (ny < 0 || ny >= height) continue;
+
+                    for (int dx = -1; dx <= 1 && !filled; dx++)
+                    {
+                        const int nx = x + dx;
+                        if (nx < 0 || nx >= width) continue;
+
+                        filled = bitmap->bitAt(nx, ny);
+                    }
+                }
+
+                if (filled) scratch.setBitAt(x, y);
+            }
+        }
+
+        swap(scratch, *bitmap);
+    }
+}
+
+
+// Rasterizes the faces of 'chart' into 'bitmap' four times, each time shifted
+// by half a texel towards a different corner, and then grows the set region by
+// one texel in all eight directions. Texture coordinates are transformed by
+// 'scale' and 'offset' before rasterization.
+void AtlasPacker::drawChartBitmap(const Chart * chart, BitMap * bitmap, const Vector2 & scale, const Vector2 & offset)
+{
+    const int width = bitmap->width();
+    const int height = bitmap->height();
+    const Vector2 extents = Vector2(float(width), float(height));
+
+    // Per-pass half-texel shift.
+    static const Vector2 pad[4] = {
+        Vector2(-0.5, -0.5),
+        Vector2(0.5, -0.5),
+        Vector2(-0.5, 0.5),
+        Vector2(0.5, 0.5)
+    };
+
+    const uint faceCount = chart->chartMesh()->faceCount();
+
+    // Rasterize 4 times to add proper padding.
+    for (int pass = 0; pass < 4; pass++)
+    {
+        for (uint f = 0; f < faceCount; f++)
+        {
+            const HalfEdge::Face * face = chart->chartMesh()->faceAt(f);
+
+            // Only the first four corners are stored; edgeCount still counts all of them.
+            Vector2 vertices[4];
+            uint edgeCount = 0;
+            for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance(), edgeCount++)
+            {
+                if (edgeCount >= 4) continue;
+
+                vertices[edgeCount] = it.vertex()->tex * scale + offset + pad[pass];
+
+                // The transformed corner must land inside the bitmap.
+                nvCheck(ftoi_ceil(vertices[edgeCount].x) >= 0);
+                nvCheck(ftoi_ceil(vertices[edgeCount].y) >= 0);
+                nvCheck(ftoi_ceil(vertices[edgeCount].x) <= width);
+                nvCheck(ftoi_ceil(vertices[edgeCount].y) <= height);
+            }
+
+            if (edgeCount != 3)
+            {
+                Raster::drawQuad(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, AtlasPacker::setBitsCallback, bitmap);
+            }
+            else
+            {
+                Raster::drawTriangle(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, AtlasPacker::setBitsCallback, bitmap);
+            }
+        }
+    }
+
+    // @@ This only allows us to expand the size in texel intervals.
+    // Expand chart by one pixel. (dilation)
+    BitMap scratch(width, height);
+    scratch.clearAll();
+
+    for (int y = 0; y < height; y++)
+    {
+        for (int x = 0; x < width; x++)
+        {
+            // A texel is set in the output if it, or any in-bounds
+            // neighbour of its 3x3 window, is set in the input.
+            bool filled = bitmap->bitAt(x, y);
+
+            for (int dy = -1; dy <= 1 && !filled; dy++)
+            {
+                const int ny = y + dy;
+                if (ny < 0 || ny >= height) continue;
+
+                for (int dx = -1; dx <= 1 && !filled; dx++)
+                {
+                    const int nx = x + dx;
+                    if (nx < 0 || nx >= width) continue;
+
+                    filled = bitmap->bitAt(nx, ny);
+                }
+            }
+
+            if (filled) scratch.setBitAt(x, y);
+        }
+    }
+
+    swap(scratch, *bitmap);
+}
+
+// Returns false if any set bit of 'bitmap', placed at (offset_x, offset_y) in
+// orientation 'r', lands on a bit that is already set in the atlas bitmap.
+// Chart bits that fall outside [0, atlas_w) x [0, atlas_h) are ignored.
+// With r == 1 the chart is transposed (its x axis runs along the atlas y axis).
+bool AtlasPacker::canAddChart(const BitMap * bitmap, int atlas_w, int atlas_h, int offset_x, int offset_y, int r)
+{
+    nvDebugCheck(r == 0 || r == 1);
+
+    const int chartW = bitmap->width();
+    const int chartH = bitmap->height();
+
+    if (r == 0)
+    {
+        for (int cy = 0; cy < chartH; cy++)
+        {
+            const int ay = cy + offset_y;
+            if (ay < 0) continue;
+
+            for (int cx = 0; cx < chartW; cx++)
+            {
+                const int ax = cx + offset_x;
+                if (ax < 0) continue;
+                if (!bitmap->bitAt(cx, cy)) continue;
+                if (ax >= atlas_w || ay >= atlas_h) continue;
+
+                if (m_bitmap.bitAt(ax, ay)) return false;
+            }
+        }
+    }
+    else if (r == 1)
+    {
+        for (int cy = 0; cy < chartH; cy++)
+        {
+            // Transposed: chart rows map to atlas columns.
+            const int ax = cy + offset_x;
+            if (ax < 0) continue;
+
+            for (int cx = 0; cx < chartW; cx++)
+            {
+                const int ay = cx + offset_y;
+                if (ay < 0) continue;
+                if (!bitmap->bitAt(cx, cy)) continue;
+                if (ax >= atlas_w || ay >= atlas_h) continue;
+
+                if (m_bitmap.bitAt(ax, ay)) return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+#if 0
+// Disabled (compiled out): debug helper that rasterizes the chart's faces at
+// (x, y) in orientation r and runs checkBitsCallback on every covered texel of
+// the atlas bitmap. Its declaration in the header is commented out as well.
+void AtlasPacker::checkCanAddChart(const Chart * chart, int w, int h, int x, int y, int r)
+{
+    nvDebugCheck(r == 0 || r == 1);
+    Vector2 extents = Vector2(float(w), float(h));
+    Vector2 offset = Vector2(float(x), float(y));
+
+    // Rasterize chart faces, checking (not setting) the covered bits.
+    const uint faceCount = chart->faceCount();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = chart->chartMesh()->faceAt(f);
+        
+        Vector2 vertices[4];
+
+        // Only the first four corners are stored; edgeCount still counts all of them.
+        uint edgeCount = 0;
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            if (edgeCount < 4)
+            {
+                Vector2 t = it.vertex()->tex;
+                // Orientation 1 transposes the chart.
+                if (r == 1) swap(t.x, t.y);
+                vertices[edgeCount] = t + offset;
+            }
+            edgeCount++;
+        }
+
+        if (edgeCount == 3)
+        {
+            Raster::drawTriangle(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, AtlasPacker::checkBitsCallback, &m_bitmap);
+        }
+        else
+        {
+            Raster::drawQuad(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, AtlasPacker::checkBitsCallback, &m_bitmap);
+        }
+    }
+}
+#endif // 0
+
+
+// Color used when painting the current chart into the debug image.
+static Color32 chartColor = Color32(0);
+
+// Picks a new, fully opaque color for the next chart and stores it in chartColor.
+// @@ Select random hue, but fixed saturation/luminance?
+static void selectRandomColor(MTRand & rand)
+{
+    // The channels are drawn in r, g, b order; that order fixes the random sequence.
+    const uint red = 128 + rand.getRange(127);
+    const uint green = 128 + rand.getRange(127);
+    const uint blue = 128 + rand.getRange(127);
+
+    chartColor.r = red;
+    chartColor.g = green;
+    chartColor.b = blue;
+    chartColor.a = 255;
+}
+// Rasterizer callback used for the debug image: paints texel (x, y) of the
+// Image passed in 'param' with the current chartColor and adds half of the
+// reported coverage ('area') to its alpha. Always returns true.
+static bool debugDrawCallback(void * param, int x, int y, Vector3::Arg, Vector3::Arg, Vector3::Arg, float area)
+{
+    Image * image = (Image *)param;
+
+    if (area > 0.0) {
+        Color32 c = image->pixel(x, y);
+        c.r = chartColor.r;
+        c.g = chartColor.g;
+        c.b = chartColor.b;
+
+        // Alpha is 8 bits wide. Accumulate in an int and saturate at 255 so
+        // that texels covered by several faces do not wrap back to a low alpha.
+        const int alpha = int(c.a) + ftoi_round(0.5f * area * 255);
+        c.a = U8(min(alpha, 255));
+
+        image->pixel(x, y) = c;
+    }
+
+    return true;
+}
+
+// Paints the faces of 'chart' into 'debugOutput' with a freshly chosen color.
+// The chart is placed at (x, y), shifted by half a texel, and transposed when
+// r == 1. (w, h) are the extents handed to the rasterizer.
+void AtlasPacker::addChart(const Chart * chart, int w, int h, int x, int y, int r, Image * debugOutput)
+{
+    nvDebugCheck(r == 0 || r == 1);
+
+    nvDebugCheck(debugOutput != NULL);
+    selectRandomColor(m_rand);
+
+    const Vector2 extents = Vector2(float(w), float(h));
+    const Vector2 origin = Vector2(float(x), float(y)) + Vector2(0.5);
+    const bool transposed = (r == 1);
+
+    // Rasterize chart faces into the debug image.
+    const uint faceCount = chart->faceCount();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = chart->chartMesh()->faceAt(f);
+
+        // Only the first four corners are stored; edgeCount still counts all of them.
+        Vector2 vertices[4];
+        uint edgeCount = 0;
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance(), edgeCount++)
+        {
+            if (edgeCount >= 4) continue;
+
+            Vector2 uv = it.vertex()->tex;
+            if (transposed) swap(uv.x, uv.y);
+            vertices[edgeCount] = uv + origin;
+        }
+
+        if (edgeCount != 3)
+        {
+            Raster::drawQuad(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, debugDrawCallback, debugOutput);
+        }
+        else
+        {
+            Raster::drawTriangle(Raster::Mode_Antialiased, extents, /*enableScissors=*/true, vertices, debugDrawCallback, debugOutput);
+        }
+    }
+}
+
+
+// Stamps the set bits of 'bitmap' at (offset_x, offset_y) in orientation 'r'.
+// When 'debugOutput' is given, the covered texels are painted into it with a
+// freshly chosen color and the atlas bitmap is left untouched; otherwise the
+// covered bits are set in the atlas bitmap. Bits outside
+// [0, atlas_w) x [0, atlas_h) are skipped. r == 1 transposes the chart.
+void AtlasPacker::addChart(const BitMap * bitmap, int atlas_w, int atlas_h, int offset_x, int offset_y, int r, Image * debugOutput)
+{
+    nvDebugCheck(r == 0 || r == 1);
+
+    const int chartW = bitmap->width();
+    const int chartH = bitmap->height();
+
+    if (debugOutput != NULL)
+    {
+        selectRandomColor(m_rand);
+    }
+
+    if (r == 0)
+    {
+        for (int cy = 0; cy < chartH; cy++)
+        {
+            const int ay = cy + offset_y;
+            if (ay < 0) continue;
+
+            for (int cx = 0; cx < chartW; cx++)
+            {
+                const int ax = cx + offset_x;
+                if (ax < 0) continue;
+                if (!bitmap->bitAt(cx, cy)) continue;
+                if (ax >= atlas_w || ay >= atlas_h) continue;
+
+                if (debugOutput)
+                {
+                    debugOutput->pixel(ax, ay) = chartColor;
+                }
+                else
+                {
+                    // The location was expected to be free.
+                    nvDebugCheck(m_bitmap.bitAt(ax, ay) == false);
+                    m_bitmap.setBitAt(ax, ay);
+                }
+            }
+        }
+    }
+    else if (r == 1)
+    {
+        for (int cy = 0; cy < chartH; cy++)
+        {
+            // Transposed: chart rows map to atlas columns.
+            const int ax = cy + offset_x;
+            if (ax < 0) continue;
+
+            for (int cx = 0; cx < chartW; cx++)
+            {
+                const int ay = cx + offset_y;
+                if (ay < 0) continue;
+                if (!bitmap->bitAt(cx, cy)) continue;
+                if (ax >= atlas_w || ay >= atlas_h) continue;
+
+                if (debugOutput)
+                {
+                    debugOutput->pixel(ax, ay) = chartColor;
+                }
+                else
+                {
+                    // The location was expected to be free.
+                    nvDebugCheck(m_bitmap.bitAt(ax, ay) == false);
+                    m_bitmap.setBitAt(ax, ay);
+                }
+            }
+        }
+    }
+}
+
+
+
+// Rasterizer callback: debug-checks that bit (x, y) of the BitMap passed in
+// 'param' is not set. Always returns true.
+/*static*/ bool AtlasPacker::checkBitsCallback(void * param, int x, int y, Vector3::Arg, Vector3::Arg, Vector3::Arg, float)
+{
+    const BitMap * target = static_cast<const BitMap *>(param);
+
+    nvDebugCheck(!target->bitAt(x, y));
+
+    return true;
+}
+
+// Rasterizer callback: sets bit (x, y) of the BitMap passed in 'param' when
+// the reported coverage 'area' is positive. Always returns true.
+/*static*/ bool AtlasPacker::setBitsCallback(void * param, int x, int y, Vector3::Arg, Vector3::Arg, Vector3::Arg, float area)
+{
+    const bool covered = (area > 0.0);
+    if (covered)
+    {
+        static_cast<BitMap *>(param)->setBitAt(x, y);
+    }
+
+    return true;
+}
+
+
+
+// Returns the fraction of texels inside the final m_width x m_height atlas
+// whose bit is set in the atlas bitmap. Returns 0 for a zero-sized atlas.
+float AtlasPacker::computeAtlasUtilization() const {
+    const uint w = m_width;
+    const uint h = m_height;
+    nvDebugCheck(w <= m_bitmap.width());
+    nvDebugCheck(h <= m_bitmap.height());
+
+    // A zero-sized atlas has no texels; without this guard the division
+    // below would be 0 / 0 and yield NaN.
+    if (w == 0 || h == 0) {
+        return 0.0f;
+    }
+
+    uint count = 0;
+    for (uint y = 0; y < h; y++) {
+        for (uint x = 0; x < w; x++) {
+            count += m_bitmap.bitAt(x, y);
+        }
+    }
+
+    return float(count) / (w * h);
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.h b/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.h
new file mode 100644
index 0000000000..2d305f38cd
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/AtlasPacker.h
@@ -0,0 +1,63 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MESH_ATLASPACKER_H
+#define NV_MESH_ATLASPACKER_H
+
+#include "nvcore/RadixSort.h"
+#include "nvmath/Vector.h"
+#include "nvmath/Random.h"
+#include "nvimage/BitMap.h"
+#include "nvimage/Image.h"
+
+#include "nvmesh/nvmesh.h"
+
+
+namespace nv
+{
+    class Atlas;
+    class Chart;
+
+    // Packs the charts of an Atlas into a single rectangle, tracking occupied
+    // texels in a bitmap.
+    struct AtlasPacker
+    {
+        AtlasPacker(Atlas * atlas);
+        ~AtlasPacker();
+
+        // Runs the packing; the final atlas size is stored in m_width / m_height.
+        void packCharts(int quality, float texelArea, bool blockAligned, bool conservative);
+
+        // Fraction of texels inside m_width x m_height whose bit is set in m_bitmap.
+        float computeAtlasUtilization() const;
+
+    private:
+
+        // Placement search. findChartLocation picks between the exhaustive and
+        // the random strategy based on 'quality' and the atlas size (w, h);
+        // all three report the chosen position, size and orientation through
+        // the best_* output pointers.
+        void findChartLocation(int quality, const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r);
+        void findChartLocation_bruteForce(const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r);
+        void findChartLocation_random(const BitMap * bitmap, Vector2::Arg extents, int w, int h, int * best_x, int * best_y, int * best_w, int * best_h, int * best_r, int minTrialCount);
+
+        // Rasterize a chart into 'bitmap' and dilate the result.
+        void drawChartBitmapDilate(const Chart * chart, BitMap * bitmap, int padding);
+        void drawChartBitmap(const Chart * chart, BitMap * bitmap, const Vector2 & scale, const Vector2 & offset);
+        
+        // True if no set bit of 'bitmap', placed at (x, y) in orientation r
+        // (0 = as is, 1 = transposed), overlaps a set bit of m_bitmap.
+        bool canAddChart(const BitMap * bitmap, int w, int h, int x, int y, int r);
+        // Stamps 'bitmap' into m_bitmap, or paints it into debugOutput when that is non-null.
+        void addChart(const BitMap * bitmap, int w, int h, int x, int y, int r, Image * debugOutput);
+        //void checkCanAddChart(const Chart * chart, int w, int h, int x, int y, int r);
+        // Paints the chart's faces into debugOutput.
+        void addChart(const Chart * chart, int w, int h, int x, int y, int r, Image * debugOutput);
+        
+
+        // Rasterizer callbacks; 'param' is the BitMap to check or modify.
+        static bool checkBitsCallback(void * param, int x, int y, Vector3::Arg bar, Vector3::Arg dx, Vector3::Arg dy, float coverage);
+        static bool setBitsCallback(void * param, int x, int y, Vector3::Arg bar, Vector3::Arg dx, Vector3::Arg dy, float coverage);
+
+    private:
+
+        Atlas * m_atlas;
+        BitMap m_bitmap;       // Occupancy bits of the atlas being packed.
+        Image m_debug_bitmap;  // Debug visualization of the packing.
+        RadixSort m_radix;
+
+        // Final atlas size, set by packCharts.
+        uint m_width;
+        uint m_height;
+        
+        // Random source for the random placement search and the debug colors.
+        MTRand m_rand;
+       
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_ATLASPACKER_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.cpp b/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.cpp
new file mode 100644
index 0000000000..cd1e8bbb7b
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.cpp
@@ -0,0 +1,483 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#include "nvmesh.h" // pch
+
+#include "LeastSquaresConformalMap.h"
+#include "ParameterizationQuality.h"
+#include "Util.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/halfedge/Face.h"
+
+#include "nvmath/Sparse.h"
+#include "nvmath/Solver.h"
+#include "nvmath/Vector.inl"
+
+#include "nvcore/Array.inl"
+
+
+using namespace nv;
+using namespace HalfEdge;
+
+namespace
+{
+
+    // Test all pairs of vertices in the boundary and check distance.
+    // Writes the two boundary vertices that are farthest apart to *a and *b.
+    // Both outputs are set to NULL when no pair of boundary vertices with a
+    // non-zero distance exists.
+    static void findDiameterVertices(HalfEdge::Mesh * mesh, HalfEdge::Vertex ** a, HalfEdge::Vertex ** b)
+    {
+        nvDebugCheck(mesh != NULL);
+        nvDebugCheck(a != NULL);
+        nvDebugCheck(b != NULL);
+
+        // Give the outputs a defined value up front: the loop below may never
+        // write them, and the final check reads them.
+        *a = NULL;
+        *b = NULL;
+
+        const uint vertexCount = mesh->vertexCount();
+
+        float maxLength = 0.0f;
+
+        // Visit every unordered pair (v1 < v0) once.
+        for (uint v0 = 1; v0 < vertexCount; v0++)
+        {
+            HalfEdge::Vertex * vertex0 = mesh->vertexAt(v0);
+            nvDebugCheck(vertex0 != NULL);
+
+            if (!vertex0->isBoundary()) continue;
+
+            for (uint v1 = 0; v1 < v0; v1++)
+            {
+                HalfEdge::Vertex * vertex1 = mesh->vertexAt(v1);
+                nvDebugCheck(vertex1 != NULL);
+
+                if (!vertex1->isBoundary()) continue;
+
+                float len = length(vertex0->pos - vertex1->pos);
+
+                if (len > maxLength)
+                {
+                    maxLength = len;
+
+                    *a = vertex0;
+                    *b = vertex1;
+                }
+            }
+        }
+
+        nvDebugCheck(*a != NULL && *b != NULL);
+    }
+
+    // Fast sweep in 3 directions.
+    // Finds the boundary vertices with the smallest and largest coordinate
+    // along each of the x, y and z axes, and writes to *a / *b the min/max
+    // pair that is farthest apart. Returns false (leaving *a and *b
+    // untouched) when the mesh has no boundary vertex.
+    static bool findApproximateDiameterVertices(HalfEdge::Mesh * mesh, HalfEdge::Vertex ** a, HalfEdge::Vertex ** b)
+    {
+        nvDebugCheck(mesh != NULL);
+        nvDebugCheck(a != NULL);
+        nvDebugCheck(b != NULL);
+
+        const uint vertexCount = mesh->vertexCount();
+
+        HalfEdge::Vertex * minVertex[3];
+        HalfEdge::Vertex * maxVertex[3];
+
+        minVertex[0] = minVertex[1] = minVertex[2] = NULL;
+        maxVertex[0] = maxVertex[1] = maxVertex[2] = NULL;
+
+        // Seed all extremes with the first boundary vertex. The scan starts at
+        // index 0 so that the first vertex of the mesh is a candidate too.
+        for (uint v = 0; v < vertexCount; v++)
+        {
+            HalfEdge::Vertex * vertex = mesh->vertexAt(v);
+            nvDebugCheck(vertex != NULL);
+
+            if (vertex->isBoundary())
+            {
+                minVertex[0] = minVertex[1] = minVertex[2] = vertex;
+                maxVertex[0] = maxVertex[1] = maxVertex[2] = vertex;
+                break;
+            }
+        }
+
+        if (minVertex[0] == NULL)
+        {
+            // Input mesh has no boundaries.
+            return false;
+        }
+
+        // Track the per-axis extremes over all boundary vertices.
+        for (uint v = 0; v < vertexCount; v++)
+        {
+            HalfEdge::Vertex * vertex = mesh->vertexAt(v);
+            nvDebugCheck(vertex != NULL);
+
+            if (!vertex->isBoundary())
+            {
+                // Skip interior vertices.
+                continue;
+            }
+
+            if (vertex->pos.x < minVertex[0]->pos.x) minVertex[0] = vertex;
+            else if (vertex->pos.x > maxVertex[0]->pos.x) maxVertex[0] = vertex;
+
+            if (vertex->pos.y < minVertex[1]->pos.y) minVertex[1] = vertex;
+            else if (vertex->pos.y > maxVertex[1]->pos.y) maxVertex[1] = vertex;
+
+            if (vertex->pos.z < minVertex[2]->pos.z) minVertex[2] = vertex;
+            else if (vertex->pos.z > maxVertex[2]->pos.z) maxVertex[2] = vertex;
+        }
+
+        // Distance between the two extreme vertices of each axis.
+        float lengths[3];
+        for (int i = 0; i < 3; i++)
+        {
+            lengths[i] = length(minVertex[i]->pos - maxVertex[i]->pos);
+        }
+
+        // Pick the axis whose extremes are farthest apart; ties fall through
+        // to the later axes.
+        if (lengths[0] > lengths[1] && lengths[0] > lengths[2])
+        {
+            *a = minVertex[0];
+            *b = maxVertex[0];
+        }
+        else if (lengths[1] > lengths[2])
+        {
+            *a = minVertex[1];
+            *b = maxVertex[1];
+        }
+        else
+        {
+            *a = minVertex[2];
+            *b = maxVertex[2];
+        }
+
+        return true;
+    }
+
+    // Conformal relations from Bruno Levy:
+
+    // Expresses the triangle (p0, p1, p2) in a local 2D orthonormal basis of
+    // its own plane: p0 maps to the origin, p1 lies on the positive x axis,
+    // and p2 gets whatever coordinates follow from that.
+    static void project_triangle(Vector3::Arg p0, Vector3::Arg p1, Vector3::Arg p2, Vector2 * z0, Vector2 * z1, Vector2 * z2)
+    {
+        const Vector3 edge01 = p1 - p0;
+        const Vector3 edge02 = p2 - p0;
+
+        // Local frame: x along the first edge, y perpendicular to it in the triangle's plane.
+        const Vector3 axisX = normalize(edge01, 0.0f);
+        const Vector3 planeNormal = normalize(cross(axisX, edge02), 0.0f);
+        const Vector3 axisY = normalize(cross(planeNormal, axisX), 0.0f);
+
+        *z0 = Vector2(0.0f, 0.0f);
+        *z1 = Vector2(length(edge01), 0.0f);
+        *z2 = Vector2(dot(edge02, axisX), dot(edge02, axisY));
+    }
+
+    // LSCM equation, geometric form :
+    // (Z1 - Z0)(U2 - U0) = (Z2 - Z0)(U1 - U0)
+    // Where Uk = uk + i.vk is the complex number 
+    //                       corresponding to (u,v) coords
+    //       Zk = xk + i.yk is the complex number 
+    //                       corresponding to local (x,y) coords
+    // cool: no divide with this expression,
+    //  makes it more numerically stable in
+    //  the presence of degenerate triangles.
+
+    // Writes the two LSCM equations (real and imaginary part) of triangle
+    // (v0, v1, v2) into A. Equation 'row' occupies matrix rows 2*row and
+    // 2*row + 1; the unknowns of a vertex are u = 2*id and v = 2*id + 1.
+    static void setup_conformal_map_relations(SparseMatrix & A, int row, const HalfEdge::Vertex * v0, const HalfEdge::Vertex * v1, const HalfEdge::Vertex * v2)
+    {
+        const int id0 = v0->id;
+        const int id1 = v1->id;
+        const int id2 = v2->id;
+
+        // Triangle corners in the local 2D frame of the triangle's plane.
+        Vector2 z0, z1, z2;
+        project_triangle(v0->pos, v1->pos, v2->pos, &z0, &z1, &z2);
+
+        const Vector2 edge01 = z1 - z0;
+        const Vector2 edge02 = z2 - z0;
+
+        const float a = edge01.x;
+        const float b = edge01.y;
+        const float c = edge02.x;
+        const float d = edge02.y;
+
+        // The first edge lies on the local x axis, so its y component is zero.
+        nvCheck(b == 0.0f);
+
+        // Note  : 2*id + 0 --> u
+        //         2*id + 1 --> v
+        const int u0_id = 2 * id0 + 0;
+        const int v0_id = 2 * id0 + 1;
+        const int u1_id = 2 * id1 + 0;
+        const int v1_id = 2 * id1 + 1;
+        const int u2_id = 2 * id2 + 0;
+        const int v2_id = 2 * id2 + 1;
+
+        const int realRow = 2 * row + 0;
+        const int imagRow = 2 * row + 1;
+
+        // Real part
+        A.setCoefficient(u0_id, realRow, -a+c);
+        A.setCoefficient(v0_id, realRow,  b-d);
+        A.setCoefficient(u1_id, realRow,   -c);
+        A.setCoefficient(v1_id, realRow,    d);
+        A.setCoefficient(u2_id, realRow,    a);
+
+        // Imaginary part
+        A.setCoefficient(u0_id, imagRow, -b+d);
+        A.setCoefficient(v0_id, imagRow, -a+c);
+        A.setCoefficient(u1_id, imagRow,   -d);
+        A.setCoefficient(v1_id, imagRow,   -c);
+        A.setCoefficient(v2_id, imagRow,    a);
+    }
+
+
+    // Conformal relations from Brecht Van Lommel (based on ABF):
+
+    // Cosine of the angle at v2 between the directions towards v1 and v3,
+    // clamped to [-1, 1].
+    static float vec_angle_cos(Vector3::Arg v1, Vector3::Arg v2, Vector3::Arg v3)
+    {
+        const Vector3 toFirst = v1 - v2;
+        const Vector3 toThird = v3 - v2;
+
+        const float cosine = dot(toFirst, toThird) / (length(toFirst) * length(toThird));
+        return clamp(cosine, -1.0f, 1.0f);
+    }
+
+    // Angle (in radians) at v2 between the directions towards v1 and v3.
+    static float vec_angle(Vector3::Arg v1, Vector3::Arg v2, Vector3::Arg v3)
+    {
+        return acosf(vec_angle_cos(v1, v2, v3));
+    }
+
+    // Interior angles of triangle (v1, v2, v3). The angles at v1 and v2 are
+    // measured; the third one is what remains of PI.
+    static void triangle_angles(Vector3::Arg v1, Vector3::Arg v2, Vector3::Arg v3, float *a1, float *a2, float *a3)
+    {
+        const float angleAtV1 = vec_angle(v3, v1, v2);
+        const float angleAtV2 = vec_angle(v1, v2, v3);
+
+        *a1 = angleAtV1;
+        *a2 = angleAtV2;
+        *a3 = PI - angleAtV2 - angleAtV1;
+    }
+
+    // Cosines of the three interior angles of triangle (v1, v2, v3), each
+    // measured directly at its own corner.
+    static void triangle_cosines(Vector3::Arg v1, Vector3::Arg v2, Vector3::Arg v3, float *a1, float *a2, float *a3)
+    {
+        const float cosAtV1 = vec_angle_cos(v3, v1, v2);
+        const float cosAtV2 = vec_angle_cos(v1, v2, v3);
+        const float cosAtV3 = vec_angle_cos(v2, v3, v1);
+
+        *a1 = cosAtV1;
+        *a2 = cosAtV2;
+        *a3 = cosAtV3;
+    }
+
+    // Writes the two angle-based conformal equations (real and imaginary
+    // part) of triangle (v0, v1, v2) into A. Equation 'row' occupies matrix
+    // rows 2*row and 2*row + 1; the unknowns of a vertex are u = 2*id and
+    // v = 2*id + 1. The coefficients are built from the sines and cosine of
+    // the triangle's interior angles.
+    static void setup_abf_relations(SparseMatrix & A, int row, const HalfEdge::Vertex * v0, const HalfEdge::Vertex * v1, const HalfEdge::Vertex * v2)
+    {
+        int id0 = v0->id;
+        int id1 = v1->id;
+        int id2 = v2->id;
+
+        Vector3 p0 = v0->pos;
+        Vector3 p1 = v1->pos;
+        Vector3 p2 = v2->pos;
+
+#if 1
+        // @@ IC: Wouldn't it be more accurate to return cos and compute 1-cos^2?
+        // It does indeed seem to be a little bit more robust.
+        // @@ Need to revisit this more carefully!
+
+        float a0, a1, a2;
+        triangle_angles(p0, p1, p2, &a0, &a1, &a2);
+
+        float s0 = sinf(a0);
+        float s1 = sinf(a1);
+        float s2 = sinf(a2);
+
+        /*// Hack for degenerate triangles.
+        if (equal(s0, 0) && equal(s1, 0) && equal(s2, 0)) {
+            if (equal(a0, 0)) a0 += 0.001f;
+            if (equal(a1, 0)) a1 += 0.001f;
+            if (equal(a2, 0)) a2 += 0.001f;
+
+            if (equal(a0, PI)) a0 = PI - a1 - a2;
+            if (equal(a1, PI)) a1 = PI - a0 - a2;
+            if (equal(a2, PI)) a2 = PI - a0 - a1;
+
+            s0 = sinf(a0);
+            s1 = sinf(a1);
+            s2 = sinf(a2);
+        }*/
+
+        // Rotate the (sine, angle, id) triples so that the strictly largest
+        // sine ends up in slot 2, which is the divisor of 'ratio' below.
+        // Nothing is rotated when slot 2 already holds the largest sine or
+        // when there is a tie for the largest.
+        if (s1 > s0 && s1 > s2)
+        {
+            // (0, 1, 2) -> (2, 0, 1)
+            swap(s1, s2);
+            swap(s0, s1);
+
+            swap(a1, a2);
+            swap(a0, a1);
+
+            swap(id1, id2);
+            swap(id0, id1);
+        }
+        else if (s0 > s1 && s0 > s2)
+        {
+            // (0, 1, 2) -> (1, 2, 0)
+            swap(s0, s2);
+            swap(s0, s1);
+
+            swap(a0, a2);
+            swap(a0, a1);
+
+            swap(id0, id2);
+            swap(id0, id1);
+        }
+
+        float c0 = cosf(a0);
+#else
+        // Disabled alternative: works from the angle cosines; here s0..s2 hold
+        // 1 - cos^2 (the squared sine) rather than the sine itself.
+        float c0, c1, c2;
+        triangle_cosines(p0, p1, p2, &c0, &c1, &c2);
+
+        float s0 = 1 - c0*c0;
+        float s1 = 1 - c1*c1;
+        float s2 = 1 - c2*c2;
+
+        nvDebugCheck(s0 != 0 || s1 != 0 || s2 != 0);
+
+        if (s1 > s0 && s1 > s2)
+        {
+            swap(s1, s2);
+            swap(s0, s1);
+
+            swap(c1, c2);
+            swap(c0, c1);
+
+            swap(id1, id2);
+            swap(id0, id1);
+        }
+        else if (s0 > s1 && s0 > s2)
+        {
+            swap(s0, s2);
+            swap(s0, s1);
+
+            swap(c0, c2);
+            swap(c0, c1);
+
+            swap(id0, id2);
+            swap(id0, id1);
+        }
+#endif
+
+        // Falls back to a ratio of 1 when the divisor is exactly zero.
+        float ratio = (s2 == 0.0f) ? 1.0f: s1/s2;
+        float cosine = c0 * ratio;
+        float sine = s0 * ratio;
+
+        // Note  : 2*id + 0 --> u
+        //         2*id + 1 --> v
+        int u0_id = 2 * id0 + 0;
+        int v0_id = 2 * id0 + 1;
+        int u1_id = 2 * id1 + 0;
+        int v1_id = 2 * id1 + 1;
+        int u2_id = 2 * id2 + 0;
+        int v2_id = 2 * id2 + 1;
+
+        // Real part
+        A.setCoefficient(u0_id, 2 * row + 0, cosine - 1.0f);
+        A.setCoefficient(v0_id, 2 * row + 0, -sine);
+        A.setCoefficient(u1_id, 2 * row + 0, -cosine);
+        A.setCoefficient(v1_id, 2 * row + 0, sine);
+        A.setCoefficient(u2_id, 2 * row + 0, 1);
+
+        // Imaginary part
+        A.setCoefficient(u0_id, 2 * row + 1, sine);
+        A.setCoefficient(v0_id, 2 * row + 1, cosine - 1.0f);
+        A.setCoefficient(u1_id, 2 * row + 1, -sine);
+        A.setCoefficient(v1_id, 2 * row + 1, -cosine);
+        A.setCoefficient(v2_id, 2 * row + 1, 1);
+    }
+
+} // namespace
+
+
+// Recomputes the texture coordinates of 'mesh' with a least-squares conformal
+// solve. The existing texcoords serve as the initial solution, and two
+// boundary vertices keep theirs (they are passed to the solver as locked
+// parameters). Returns false, leaving the mesh untouched, when there are too
+// few triangles, when the mesh has no boundary, or when the two pinned
+// vertices share the same texcoord.
+bool nv::computeLeastSquaresConformalMap(HalfEdge::Mesh * mesh)
+{
+    nvDebugCheck(mesh != NULL);
+
+    // For this to work properly, mesh should not have colocals that have the same
+    // attributes, unless you want the vertices to actually have different texcoords.
+
+    const uint vertexCount = mesh->vertexCount();
+    const uint D = 2 * vertexCount;               // Unknowns: (u, v) per vertex.
+    const uint N = 2 * countMeshTriangles(mesh);  // Equations: real + imaginary part per triangle.
+
+    // Four of the D unknowns belong to the two pinned vertices. Give up when
+    // there are fewer equations than free unknowns.
+    if (N < D - 4)
+    {
+        return false;
+    }
+
+    SparseMatrix A(D, N);
+    FullVector b(N);
+    FullVector x(D);
+
+    // Right-hand side is all zeros.
+    b.fill(0.0f);
+
+    // Choose the two vertices to pin.
+    HalfEdge::Vertex * v0;
+    HalfEdge::Vertex * v1;
+    const bool hasBoundary = findApproximateDiameterVertices(mesh, &v0, &v1);
+    if (!hasBoundary)
+    {
+        // Mesh has no boundaries.
+        return false;
+    }
+    if (v0->tex == v1->tex)
+    {
+        // LSCM expects an existing parameterization.
+        return false;
+    }
+
+    // Initial solution: the current texture coordinates.
+    for (uint i = 0; i < vertexCount; i++)
+    {
+        HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+        nvDebugCheck(vertex != NULL);
+
+        x[2 * i + 0] = vertex->tex.x;
+        x[2 * i + 1] = vertex->tex.y;
+    }
+
+    // Fill A: one pair of rows per triangle.
+    const uint faceCount = mesh->faceCount();
+    uint triangle = 0;
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = mesh->faceAt(f);
+        nvDebugCheck(face != NULL);
+        nvDebugCheck(face->edgeCount() == 3);
+
+        // The first edge only supplies the fan apex. Every later edge that
+        // does not lead back into the apex forms one triangle with it.
+        const HalfEdge::Vertex * apex = NULL;
+
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Edge * edge = it.current();
+            nvCheck(edge != NULL);
+
+            if (apex == NULL)
+            {
+                apex = edge->vertex;
+                continue;
+            }
+
+            if (edge->next->vertex == apex)
+            {
+                continue;
+            }
+
+            const HalfEdge::Vertex * from = edge->from();
+            const HalfEdge::Vertex * to = edge->to();
+
+            setup_abf_relations(A, triangle, apex, from, to);
+            //setup_conformal_map_relations(A, triangle, apex, from, to);
+
+            triangle++;
+        }
+    }
+
+    // The u and v unknowns of the two pinned vertices.
+    const uint lockedParameters[] =
+    {
+        2 * v0->id + 0,
+        2 * v0->id + 1,
+        2 * v1->id + 0,
+        2 * v1->id + 1
+    };
+
+    // Solve
+    LeastSquaresSolver(A, b, x, lockedParameters, 4, 0.000001f);
+
+    // Map x back to texcoords:
+    for (uint i = 0; i < vertexCount; i++)
+    {
+        HalfEdge::Vertex * vertex = mesh->vertexAt(i);
+        nvDebugCheck(vertex != NULL);
+
+        vertex->tex = Vector2(x[2 * i + 0], x[2 * i + 1]);
+    }
+
+    return true;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.h b/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.h
new file mode 100644
index 0000000000..51fbf193c8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/LeastSquaresConformalMap.h
@@ -0,0 +1,15 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#pragma once
+#ifndef NV_MESH_LEASTSQUARESCONFORMALMAP_H
+#define NV_MESH_LEASTSQUARESCONFORMALMAP_H
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+    // Runs a least-squares solve and stores the solution in each vertex's `tex`; the end of the definition returns true.
+    bool computeLeastSquaresConformalMap(HalfEdge::Mesh * mesh);
+
+} // nv namespace
+
+#endif // NV_MESH_LEASTSQUARESCONFORMALMAP_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.cpp b/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.cpp
new file mode 100644
index 0000000000..d6e5e30561
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.cpp
@@ -0,0 +1,99 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "OrthogonalProjectionMap.h"
+
+#include "nvcore/Array.inl"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Box.inl"
+#include "nvmath/Plane.inl"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/geometry/Bounds.h"
+
+
+using namespace nv;
+
+bool nv::computeOrthogonalProjectionMap(HalfEdge::Mesh * mesh)
+{
+    Vector3 axis[2];
+    // axis[0] / axis[1] are the two directions every vertex position is projected onto at the end.
+#if 1
+    // Active path: derive the axes from the covariance of the vertex positions.
+    uint vertexCount = mesh->vertexCount();
+    Array<Vector3> points(vertexCount);
+    points.resize(vertexCount);
+
+    for (uint i = 0; i < vertexCount; i++)
+    {
+        points[i] = mesh->vertexAt(i)->pos;
+    }
+
+#if 0
+    axis[0] = Fit::computePrincipalComponent_EigenSolver(vertexCount, points.buffer());
+    axis[0] = normalize(axis[0]);
+
+    Plane plane = Fit::bestPlane(vertexCount, points.buffer());
+
+    Vector3 n = plane.vector();
+
+    axis[1] = cross(axis[0], n);
+    axis[1] = normalize(axis[1]);
+#else
+    // Avoid redundant computations.
+    float matrix[6];
+    Fit::computeCovariance(vertexCount, points.buffer(), matrix);
+    // Give up when entries 0, 3 and 5 are all zero (presumably the diagonal of the packed symmetric matrix -- confirm in Fitting.h).
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) {
+        return false;
+    }
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!nv::Fit::eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) {
+        return false;
+    }
+    // The first two eigenvectors become the projection axes (presumably the largest-variance directions -- confirm the solver's ordering).
+    axis[0] = normalize(eigenVectors[0]);
+    axis[1] = normalize(eigenVectors[1]);
+#endif
+
+
+#else
+
+    // IC: I thought this was generally more robust, but turns out it's not even guaranteed to return a valid projection. Imagine a narrow quad perpendicular to one plane, but rotated so that the shortest axis of 
+    // the bounding box is in the direction of that plane.
+
+    // Use the shortest box axis
+    Box box = MeshBounds::box(mesh);
+    Vector3 dir = box.extents();
+
+    if (fabs(dir.x) <= fabs(dir.y) && fabs(dir.x) <= fabs(dir.z)) {
+        axis[0] = Vector3(0, 1, 0); 
+        axis[1] = Vector3(0, 0, 1);
+    }
+    else if (fabs(dir.y) <= fabs(dir.z)) {
+        axis[0] = Vector3(1, 0, 0); 
+        axis[1] = Vector3(0, 0, 1);
+    }
+    else {
+        axis[0] = Vector3(1, 0, 0); 
+        axis[1] = Vector3(0, 1, 0);
+    }
+#endif
+
+    // Project vertices to plane: tex.x / tex.y are the dot products of the position with axis[0] / axis[1].
+    for (HalfEdge::Mesh::VertexIterator it(mesh->vertices()); !it.isDone(); it.advance())
+    {
+        HalfEdge::Vertex * vertex = it.current();
+        vertex->tex.x = dot(axis[0], vertex->pos);
+        vertex->tex.y = dot(axis[1], vertex->pos);
+    }
+
+    return true;
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.h b/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.h
new file mode 100644
index 0000000000..54920413d5
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/OrthogonalProjectionMap.h
@@ -0,0 +1,15 @@
+// This code is in the public domain -- castano@gmail.com
+
+#pragma once
+#ifndef NV_MESH_ORTHOGONALPROJECTIONMAP_H
+#define NV_MESH_ORTHOGONALPROJECTIONMAP_H
+
+namespace nv
+{
+    namespace HalfEdge { class Mesh; }
+    // Projects every vertex position onto two axes and stores the result in `tex`; returns false when no axes could be derived.
+    bool computeOrthogonalProjectionMap(HalfEdge::Mesh * mesh);
+
+} // nv namespace
+
+#endif // NV_MESH_ORTHOGONALPROJECTIONMAP_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.cpp b/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.cpp
new file mode 100644
index 0000000000..683ee603cd
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.cpp
@@ -0,0 +1,323 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#include "nvmesh.h" // pch
+
+#include "ParameterizationQuality.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/halfedge/Edge.h"
+
+#include "nvmath/Vector.inl"
+
+#include "nvcore/Debug.h"
+
+#include <float.h>
+
+
+using namespace nv;
+
+#if 0
+/*
+float triangleConformalEnergy(Vector3 q[3], Vector2 p[3])
+{
+const Vector3 v1 = q[0];
+const Vector3 v2 = q[1];
+const Vector3 v3 = q[2];
+
+const Vector2 w1 = p[0];
+const Vector2 w2 = p[1];
+const Vector2 w3 = p[2];
+
+float x1 = v2.x() - v1.x();
+float x2 = v3.x() - v1.x();
+float y1 = v2.y() - v1.y();
+float y2 = v3.y() - v1.y();
+float z1 = v2.z() - v1.z();
+float z2 = v3.z() - v1.z();
+
+float s1 = w2.x() - w1.x();
+float s2 = w3.x() - w1.x();
+float t1 = w2.y() - w1.y();
+float t2 = w3.y() - w1.y();
+
+float r = 1.0f / (s1 * t2 - s2 * t1);
+Vector3 sdir((t2 * x1 - t1 * x2) * r, (t2 * y1 - t1 * y2) * r, (t2 * z1 - t1 * z2) * r);
+Vector3 tdir((s1 * x2 - s2 * x1) * r, (s1 * y2 - s2 * y1) * r, (s1 * z2 - s2 * z1) * r);
+
+Vector3 N = cross(v3-v1, v2-v1);
+
+// Rotate 90 around N.
+}
+*/
+
+static float triangleConformalEnergy(Vector3 q[3], Vector2 p[3])
+{
+    // Using Denis formulas:
+    Vector3 c0 = q[1] - q[2];
+    Vector3 c1 = q[2] - q[0];
+    Vector3 c2 = q[0] - q[1];
+
+    Vector3 N = cross(-c0, c1);
+    float T = length(N);	// 2T
+    N = normalize(N, 0);
+
+    float cot_alpha0 = dot(-c1, c2) / length(cross(-c1, c2));
+    float cot_alpha1 = dot(-c2, c0) / length(cross(-c2, c0));
+    float cot_alpha2 = dot(-c0, c1) / length(cross(-c0, c1));
+
+    Vector3 t0 = -cot_alpha1 * c1 + cot_alpha2 * c2;
+    Vector3 t1 = -cot_alpha2 * c2 + cot_alpha0 * c0;
+    Vector3 t2 = -cot_alpha0 * c0 + cot_alpha1 * c1;
+
+    nvCheck(equal(length(t0), length(c0)));
+    nvCheck(equal(length(t1), length(c1)));
+    nvCheck(equal(length(t2), length(c2)));
+    nvCheck(equal(dot(t0, c0), 0));
+    nvCheck(equal(dot(t1, c1), 0));
+    nvCheck(equal(dot(t2, c2), 0));
+
+    // Gradients
+    Vector3 grad_u = 1.0f / T * (p[0].x * t0 + p[1].x * t1 + p[2].x * t2);
+    Vector3 grad_v = 1.0f / T * (p[0].y * t0 + p[1].y * t1 + p[2].y * t2);
+
+    // Rotated gradients
+    Vector3 Jgrad_u = 1.0f / T * (p[0].x * c0 + p[1].x * c1 + p[2].x * c2);
+    Vector3 Jgrad_v = 1.0f / T * (p[0].y * c0 + p[1].y * c1 + p[2].y * c2);
+
+    // Using Lengyel's formulas:
+    { 
+        const Vector3 v1 = q[0];
+        const Vector3 v2 = q[1];
+        const Vector3 v3 = q[2];
+
+        const Vector2 w1 = p[0];
+        const Vector2 w2 = p[1];
+        const Vector2 w3 = p[2];
+
+        float x1 = v2.x - v1.x;
+        float x2 = v3.x - v1.x;
+        float y1 = v2.y - v1.y;
+        float y2 = v3.y - v1.y;
+        float z1 = v2.z - v1.z;
+        float z2 = v3.z - v1.z;
+
+        float s1 = w2.x - w1.x;
+        float s2 = w3.x - w1.x;
+        float t1 = w2.y - w1.y;
+        float t2 = w3.y - w1.y;
+
+        float r = 1.0f / (s1 * t2 - s2 * t1);
+        Vector3 sdir((t2 * x1 - t1 * x2) * r, (t2 * y1 - t1 * y2) * r, (t2 * z1 - t1 * z2) * r);
+        Vector3 tdir((s1 * x2 - s2 * x1) * r, (s1 * y2 - s2 * y1) * r, (s1 * z2 - s2 * z1) * r);
+
+        Vector3 Jsdir = cross(N, sdir);
+        Vector3 Jtdir = cross(N, tdir);
+
+        float x = 3;
+    }
+
+    // check: sdir == grad_u
+    // check: tdir == grad_v
+
+    return length(grad_u - Jgrad_v);
+}
+#endif // 0
+
+
+ParameterizationQuality::ParameterizationQuality()
+    // Empty accumulation: no triangles counted, every sum and maximum at zero.
+    : m_totalTriangleCount(0)
+    , m_flippedTriangleCount(0)
+    , m_zeroAreaTriangleCount(0)
+    // Accumulated areas.
+    , m_parametricArea(0.0f)
+    , m_geometricArea(0.0f)
+    // Stretch sum and maximum.
+    , m_stretchMetric(0.0f)
+    , m_maxStretchMetric(0.0f)
+    // Conformal and authalic sums.
+    , m_conformalMetric(0.0f)
+    , m_authalicMetric(0.0f)
+{}
+
+ParameterizationQuality::ParameterizationQuality(const HalfEdge::Mesh * mesh)
+{
+    nvDebugCheck(mesh != NULL);
+    // Accumulates the quality statistics over every triangle of `mesh`, reading each vertex's `pos` and `tex`.
+    m_totalTriangleCount = 0;
+    m_flippedTriangleCount = 0;
+    m_zeroAreaTriangleCount = 0;
+
+    m_parametricArea = 0.0f;
+    m_geometricArea = 0.0f;
+
+    m_stretchMetric = 0.0f;
+    m_maxStretchMetric = 0.0f;
+
+    m_conformalMetric = 0.0f;
+    m_authalicMetric = 0.0f;
+
+    const uint faceCount = mesh->faceCount();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = mesh->faceAt(f);
+        const HalfEdge::Vertex * vertex0 = NULL;
+
+        Vector3 p[3];
+        Vector2 t[3];
+        // Fan around the first vertex: every later edge that does not end at vertex0 closes one triangle.
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+        {
+            const HalfEdge::Edge * edge = it.current();
+
+            if (vertex0 == NULL)
+            {
+                vertex0 = edge->vertex;
+
+                p[0] = vertex0->pos;
+                t[0] = vertex0->tex;
+            }
+            else if (edge->to() != vertex0)
+            {
+                p[1] = edge->from()->pos;
+                p[2] = edge->to()->pos;
+                t[1] = edge->from()->tex;
+                t[2] = edge->to()->tex;
+
+                processTriangle(p, t);
+            }
+        }
+    }
+    // Compare with the triangles actually processed, not the face count: a face with more than 3 edges yields several.
+    if (m_flippedTriangleCount + m_zeroAreaTriangleCount == m_totalTriangleCount)
+    {
+        // If all triangles are flipped, then none is.
+        m_flippedTriangleCount = 0;
+    }
+
+    nvDebugCheck(isFinite(m_parametricArea) && m_parametricArea >= 0);
+    nvDebugCheck(isFinite(m_geometricArea) && m_geometricArea >= 0);
+    nvDebugCheck(isFinite(m_stretchMetric));
+    nvDebugCheck(isFinite(m_maxStretchMetric));
+    nvDebugCheck(isFinite(m_conformalMetric));
+    nvDebugCheck(isFinite(m_authalicMetric));
+}
+
+bool ParameterizationQuality::isValid() const
+{
+    return !(m_flippedTriangleCount > 0); // Valid means no flipped triangles. @@ Does not test for self-overlaps.
+}
+
+float ParameterizationQuality::rmsStretchMetric() const
+{
+    // Area-normalized stretch sum, rescaled by sqrt(parametric area / geometric area); zero without area.
+    const bool hasArea = (m_geometricArea != 0);
+    return hasArea ? sqrtf(m_stretchMetric / m_geometricArea) * sqrtf(m_parametricArea / m_geometricArea) : 0.0f;
+}
+
+float ParameterizationQuality::maxStretchMetric() const
+{
+    // Largest per-triangle stretch, rescaled by sqrt(parametric area / geometric area); zero without area.
+    const bool hasArea = (m_geometricArea != 0);
+    return hasArea ? m_maxStretchMetric * sqrtf(m_parametricArea / m_geometricArea) : 0.0f;
+}
+
+float ParameterizationQuality::rmsConformalMetric() const
+{
+    // Square root of the conformal sum divided by the total geometric area; zero when no area was accumulated.
+    return (m_geometricArea == 0) ? 0.0f : sqrtf(m_conformalMetric / m_geometricArea);
+}
+
+float ParameterizationQuality::maxAuthalicMetric() const
+{
+    // Square root of the authalic sum divided by the total geometric area; zero when no area was accumulated.
+    return (m_geometricArea == 0) ? 0.0f : sqrtf(m_authalicMetric / m_geometricArea);
+}
+
+void ParameterizationQuality::operator += (const ParameterizationQuality & pq)
+{
+    // Triangle counters add up.
+    m_totalTriangleCount = m_totalTriangleCount + pq.m_totalTriangleCount;
+    m_flippedTriangleCount = m_flippedTriangleCount + pq.m_flippedTriangleCount;
+    m_zeroAreaTriangleCount = m_zeroAreaTriangleCount + pq.m_zeroAreaTriangleCount;
+    // So do the accumulated areas and the area-weighted metric sums.
+    m_parametricArea = m_parametricArea + pq.m_parametricArea;
+    m_geometricArea = m_geometricArea + pq.m_geometricArea;
+    m_stretchMetric = m_stretchMetric + pq.m_stretchMetric;
+    m_conformalMetric = m_conformalMetric + pq.m_conformalMetric;
+    m_authalicMetric = m_authalicMetric + pq.m_authalicMetric;
+    // The maximum stretch is the larger of the two maxima.
+    m_maxStretchMetric = max(m_maxStretchMetric, pq.m_maxStretchMetric);
+}
+
+
+void ParameterizationQuality::processTriangle(Vector3 q[3], Vector2 p[3])
+{
+    m_totalTriangleCount++;
+    // q holds the three 3D corners of one triangle, p the matching texture coordinates.
+    // Evaluate texture stretch metric. See:
+    // - "Texture Mapping Progressive Meshes", Sander, Snyder, Gortler & Hoppe
+    // - "Mesh Parameterization: Theory and Practice", Siggraph'07 Course Notes, Hormann, Levy & Sheffer.
+    // Note the naming: the t* values are read from .x and the s* values from .y.
+    float t1 = p[0].x;
+    float s1 = p[0].y;
+    float t2 = p[1].x;
+    float s2 = p[1].y;
+    float t3 = p[2].x;
+    float s3 = p[2].y;
+
+    float geometricArea = length(cross(q[1] - q[0], q[2] - q[0])) / 2;
+    float parametricArea = ((s2 - s1)*(t3 - t1) - (s3 - s1)*(t2 - t1)) / 2;
+    // A triangle with zero area in texture space is counted but adds nothing to the sums.
+    if (isZero(parametricArea))
+    {
+        m_zeroAreaTriangleCount++;
+        return;
+    }
+
+    Vector3 Ss = (q[0] * (t2- t3) + q[1] * (t3 - t1) + q[2] * (t1 - t2)) / (2 * parametricArea);
+    Vector3 St = (q[0] * (s3- s2) + q[1] * (s1 - s3) + q[2] * (s2 - s1)) / (2 * parametricArea);
+
+    float a = dot(Ss, Ss); // E
+    float b = dot(Ss, St); // F
+    float c = dot(St, St); // G
+
+    // Square roots of the eigenvalues of the first fundamental form [[a, b], [b, c]]; max() clamps negative round-off:
+    float sigma1 = sqrtf(0.5f * max(0.0f, a + c - sqrtf(square(a - c) + 4 * square(b)))); // gamma uppercase, min eigenvalue.
+    float sigma2 = sqrtf(0.5f * max(0.0f, a + c + sqrtf(square(a - c) + 4 * square(b)))); // gamma lowercase, max eigenvalue.
+    nvCheck(sigma2 >= sigma1);
+
+    // isometric: sigma1 = sigma2 = 1
+    // conformal: sigma1 / sigma2 = 1
+    // authalic: sigma1 * sigma2 = 1
+    // rmsStretch2 recomputes rmsStretch from the sigmas and is only used by the debug check below.
+    float rmsStretch = sqrtf((a + c) * 0.5f);
+    float rmsStretch2 = sqrtf((square(sigma1) + square(sigma2)) * 0.5f);
+    nvDebugCheck(equal(rmsStretch, rmsStretch2, 0.01f));
+
+    if (parametricArea < 0.0f)
+    {
+        // Count flipped triangles.
+        m_flippedTriangleCount++;
+
+        parametricArea = fabsf(parametricArea);
+    }
+    // All sums below are weighted by the triangle's geometric (3D) area.
+    m_stretchMetric += square(rmsStretch) * geometricArea;
+    m_maxStretchMetric = max(m_maxStretchMetric, sigma2);
+
+    if (!isZero(sigma1, 0.000001f)) {
+        // sigma1 is zero when geometricArea is zero.
+        m_conformalMetric += (sigma2 / sigma1) * geometricArea;
+    }
+    m_authalicMetric += (sigma1 * sigma2) * geometricArea;
+
+    // Accumulate total areas.
+    m_geometricArea += geometricArea;
+    m_parametricArea += parametricArea;
+
+
+    //triangleConformalEnergy(q, p);
+}
diff --git a/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.h b/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.h
new file mode 100644
index 0000000000..342e26b889
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/ParameterizationQuality.h
@@ -0,0 +1,56 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#pragma once
+#ifndef NV_MESH_PARAMETERIZATIONQUALITY_H
+#define NV_MESH_PARAMETERIZATIONQUALITY_H
+
+#include <nvmesh/nvmesh.h>
+
+namespace nv
+{
+    class Vector2;
+    class Vector3;
+
+    namespace HalfEdge { class Mesh; }
+
+    // Estimate quality of existing parameterization.
+    NVMESH_CLASS class ParameterizationQuality
+    {
+    public:
+        ParameterizationQuality();
+        ParameterizationQuality(const HalfEdge::Mesh * mesh);
+        // True when no flipped triangle was counted (self-overlaps are not tested).
+        bool isValid() const;
+        // Both stretch metrics return 0 when no geometric area was accumulated.
+        float rmsStretchMetric() const;
+        float maxStretchMetric() const;
+        // Square roots of the area-normalized conformal / authalic sums (despite its name, maxAuthalicMetric is not a maximum).
+        float rmsConformalMetric() const;
+        float maxAuthalicMetric() const;
+        // Merges another accumulation into this one: counts and sums add, the maximum stretch keeps the larger value.
+        void operator += (const ParameterizationQuality & pq);
+
+    private:
+        // Adds one triangle (3D corners p, texture corners t) to the running totals.
+        void processTriangle(Vector3 p[3], Vector2 t[3]);
+
+    private:
+        // Triangle counters.
+        uint m_totalTriangleCount;
+        uint m_flippedTriangleCount;
+        uint m_zeroAreaTriangleCount;
+        // Accumulated areas (triangles with zero parametric area are left out).
+        float m_parametricArea;
+        float m_geometricArea;
+        // Area-weighted stretch sum and the largest per-triangle stretch seen.
+        float m_stretchMetric;
+        float m_maxStretchMetric;
+        // Area-weighted conformal and authalic sums.
+        float m_conformalMetric;
+        float m_authalicMetric;
+
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_PARAMETERIZATIONQUALITY_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.cpp b/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.cpp
new file mode 100644
index 0000000000..4b205de8bf
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.cpp
@@ -0,0 +1,53 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#include "nvmesh.h" // pch
+
+#include "SingleFaceMap.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/halfedge/Face.h"
+
+#include "nvmath/Vector.inl"
+
+using namespace nv;
+
+
+
+void nv::computeSingleFaceMap(HalfEdge::Mesh * mesh)
+{
+    // Lays out the only face of the mesh in a 2D frame built from its first edge and its normal.
+    nvDebugCheck(mesh != NULL);
+    nvDebugCheck(mesh->faceCount() == 1);
+
+    HalfEdge::Face * face = mesh->faceAt(0);
+    nvCheck(face != NULL);
+
+    // The first edge of the face supplies the origin (its start point) and the first axis.
+    const Vector3 origin = face->edge->from()->pos;
+    const Vector3 edgeEnd = face->edge->to()->pos;
+
+    // Frame: U along the first edge, V = normal x U; normalizeSafe is handed a zero vector as its fallback.
+    const Vector3 uAxis = normalizeSafe(edgeEnd - origin, Vector3(0.0f), 0.0f);
+    const Vector3 faceNormal = face->normal();
+    const Vector3 vAxis = normalizeSafe(cross(faceNormal, uAxis), Vector3(0.0f), 0.0f);
+
+    bool first = true;
+    for (HalfEdge::Face::EdgeIterator it(face->edges()); !it.isDone(); it.advance())
+    {
+        HalfEdge::Vertex * vertex = it.vertex();
+        nvCheck(vertex != NULL);
+
+        if (first)
+        {
+            // The first vertex visited is pinned to the texture-space origin.
+            first = false;
+            vertex->tex = Vector2(0);
+            continue;
+        }
+
+        const Vector3 offset = vertex->pos - origin;
+        vertex->tex = Vector2(dot(offset, uAxis), dot(offset, vAxis));
+    }
+}
+
diff --git a/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.h b/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.h
new file mode 100644
index 0000000000..b70719f5d8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/SingleFaceMap.h
@@ -0,0 +1,18 @@
+// Copyright NVIDIA Corporation 2008 -- Ignacio Castano <icastano@nvidia.com>
+
+#pragma once
+#ifndef NV_MESH_SINGLEFACEMAP_H
+#define NV_MESH_SINGLEFACEMAP_H
+
+namespace nv
+{
+    namespace HalfEdge
+    {
+        class Mesh;
+    }
+    // Expects a mesh with exactly one face (debug-checked) and writes the `tex` of that face's vertices.
+    void computeSingleFaceMap(HalfEdge::Mesh * mesh);
+
+} // nv namespace
+
+#endif // NV_MESH_SINGLEFACEMAP_H
diff --git a/thirdparty/thekla_atlas/nvmesh/param/Util.cpp b/thirdparty/thekla_atlas/nvmesh/param/Util.cpp
new file mode 100644
index 0000000000..fe7b58edf8
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/Util.cpp
@@ -0,0 +1,326 @@
+// This code is in the public domain -- castano@gmail.com
+
+#include "nvmesh.h" // pch
+
+#include "Util.h"
+
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/halfedge/Vertex.h"
+
+#include "nvmath/Vector.inl"
+
+#include "nvcore/Array.inl"
+
+
+using namespace nv;
+
+// Returns true when every face of the mesh has exactly four edges (also true for a mesh without faces).
+bool nv::isQuadMesh(const HalfEdge::Mesh * mesh)
+{
+    nvDebugCheck(mesh != NULL);
+
+    bool allQuads = true;
+    const uint total = mesh->faceCount();
+    // Stop scanning at the first face that is not a quad.
+    for (uint faceIndex = 0; allQuads && faceIndex < total; faceIndex++)
+    {
+        allQuads = (mesh->faceAt(faceIndex)->edgeCount() == 4);
+    }
+
+    return allQuads;
+}
+
+bool nv::isTriangularMesh(const HalfEdge::Mesh * mesh)
+{
+    // True when the scan reaches the end without meeting a face whose edge count differs from three.
+    HalfEdge::Mesh::ConstFaceIterator faceIt(mesh->faces());
+    while (!faceIt.isDone() && faceIt.current()->edgeCount() == 3) {
+        faceIt.advance();
+    }
+    return faceIt.isDone();
+}
+
+
+uint nv::countMeshTriangles(const HalfEdge::Mesh * mesh)
+{
+    // A face with k edges counts as k - 2 triangles; sum that over all faces.
+    uint total = 0;
+
+    const uint numFaces = mesh->faceCount();
+    uint faceIndex = 0;
+    while (faceIndex < numFaces)
+    {
+        const uint sides = mesh->faceAt(faceIndex)->edgeCount();
+        nvDebugCheck(sides > 2);
+
+        total = total + (sides - 2);
+        faceIndex++;
+    }
+
+    return total;
+}
+
+const HalfEdge::Vertex * nv::findBoundaryVertex(const HalfEdge::Mesh * mesh)
+{
+    // Scan in index order; the first vertex reporting isBoundary() wins, NULL means there is none.
+    const HalfEdge::Vertex * found = NULL;
+    const uint total = mesh->vertexCount();
+    for (uint index = 0; found == NULL && index < total; index++)
+    {
+        const HalfEdge::Vertex * candidate = mesh->vertexAt(index);
+        if (candidate->isBoundary()) found = candidate;
+    }
+    return found;
+}
+
+
+HalfEdge::Mesh * nv::unifyVertices(const HalfEdge::Mesh * inputMesh)
+{
+    // Builds a mesh (allocated with new and returned) that keeps one vertex per colocal group -- the first
+    // colocal -- plus a copy of every input face re-pointed at those vertices.
+    HalfEdge::Mesh * mesh = new HalfEdge::Mesh;
+
+    // Only first colocals are copied, so vertices get renumbered. Record the new index of each copied vertex:
+    // faces must use it rather than the input id (assumes Vertex::id is the vertex's index in inputMesh).
+    const uint vertexCount = inputMesh->vertexCount();
+    nv::Array<uint> newIndex;
+    newIndex.resize(vertexCount);
+    for (uint v = 0; v < vertexCount; v++) {
+        const HalfEdge::Vertex * vertex = inputMesh->vertexAt(v);
+        newIndex[v] = ~0u; // Placeholder; entries of non-first colocals are never looked up.
+        if (vertex->isFirstColocal()) {
+            newIndex[v] = mesh->vertexCount();
+            mesh->addVertex(vertex->pos);
+        }
+    }
+
+    nv::Array<uint> indexArray;
+
+    // Add new faces pointing to first colocals.
+    uint faceCount = inputMesh->faceCount();
+    for (uint f = 0; f < faceCount; f++) {
+        const HalfEdge::Face * face = inputMesh->faceAt(f);
+        indexArray.clear();
+        for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance()) {
+            const HalfEdge::Edge * edge = it.current();
+            const HalfEdge::Vertex * vertex = edge->vertex->firstColocal();
+            indexArray.append(newIndex[vertex->id]);
+        }
+        mesh->addFace(indexArray);
+    }
+    mesh->linkBoundary();
+    return mesh;
+}
+
+#include "nvmath/Basis.h"
+
+static bool pointInTriangle(const Vector2 & p, const Vector2 & a, const Vector2 & b, const Vector2 & c)
+{
+    const float kMinArea = 0.00001f; // Margin: p must clear every edge by at least this signed area.
+    if (!(triangleArea(a, b, p) >= kMinArea) || !(triangleArea(b, c, p) >= kMinArea)) return false;
+    return triangleArea(c, a, p) >= kMinArea;
+}
+
+
+// Returns a new mesh with the same vertices where every face with more than three edges is split by a simple
+// ear-clipping pass that prefers valid ears with the smallest angle. Triangles are copied as they are.
+HalfEdge::Mesh * nv::triangulate(const HalfEdge::Mesh * inputMesh)
+{
+    HalfEdge::Mesh * mesh = new HalfEdge::Mesh;
+    
+    // Add all vertices.
+    const uint vertexCount = inputMesh->vertexCount();
+    for (uint v = 0; v < vertexCount; v++) {
+        const HalfEdge::Vertex * vertex = inputMesh->vertexAt(v);
+        mesh->addVertex(vertex->pos);
+    }
+
+    Array<int> polygonVertices;
+    Array<float> polygonAngles;
+    Array<Vector2> polygonPoints;
+
+    const uint faceCount = inputMesh->faceCount();
+    for (uint f = 0; f < faceCount; f++)
+    {
+        const HalfEdge::Face * face = inputMesh->faceAt(f);
+        nvDebugCheck(face != NULL);
+
+        const uint edgeCount = face->edgeCount();
+        nvDebugCheck(edgeCount >= 3);
+
+        polygonVertices.clear();
+        polygonVertices.reserve(edgeCount);
+
+        if (edgeCount == 3) {
+            // Simple case for triangles.
+            for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+            {
+                const HalfEdge::Edge * edge = it.current();
+                const HalfEdge::Vertex * vertex = edge->vertex;
+                polygonVertices.append(vertex->id);
+            }
+
+            int v0 = polygonVertices[0];
+            int v1 = polygonVertices[1];
+            int v2 = polygonVertices[2];
+
+            mesh->addFace(v0, v1, v2);
+        }
+        else {
+            // Build 2D polygon projecting vertices onto normal plane.
+            // Faces are not necessarily planar, this is for example the case, when the face comes from filling a hole. In such cases
+            // it's much better to use the best fit plane.
+            const Vector3 fn = face->normal();
+
+            Basis basis;
+            basis.buildFrameForDirection(fn);
+
+            polygonPoints.clear();
+            polygonPoints.reserve(edgeCount);
+            polygonAngles.clear();
+            polygonAngles.reserve(edgeCount);
+
+            for (HalfEdge::Face::ConstEdgeIterator it(face->edges()); !it.isDone(); it.advance())
+            {
+                const HalfEdge::Edge * edge = it.current();
+                const HalfEdge::Vertex * vertex = edge->vertex;
+                polygonVertices.append(vertex->id);
+                
+                Vector2 p;
+                p.x = dot(basis.tangent, vertex->pos);
+                p.y = dot(basis.bitangent, vertex->pos);
+
+                polygonPoints.append(p);
+            }
+            polygonAngles.resize(edgeCount);
+
+            while (polygonVertices.size() > 2) {
+                uint size = polygonVertices.size();
+
+                // Update polygon angles. @@ Update only those that have changed.
+                float minAngle = 2 * PI;
+                uint bestEar = 0; // Use first one if none of them is valid.
+                bool bestIsValid = false;
+                for (uint i = 0; i < size; i++) {
+                    uint i0 = i; 
+                    uint i1 = (i+1) % size; // Use Sean's polygon iteration trick.
+                    uint i2 = (i+2) % size;
+
+                    Vector2 p0 = polygonPoints[i0];
+                    Vector2 p1 = polygonPoints[i1];
+                    Vector2 p2 = polygonPoints[i2];
+
+                    float d = clamp(dot(p0-p1, p2-p1) / (length(p0-p1) * length(p2-p1)), -1.0f, 1.0f);
+                    float angle = acosf(d);
+                    
+                    float area = triangleArea(p0, p1, p2);
+                    if (area < 0.0f) angle = 2.0f * PI - angle;
+
+                    polygonAngles[i1] = angle; // Stored, but not read back by the live code in this function.
+
+                    if (angle < minAngle || !bestIsValid) {
+
+                        // Make sure this is a valid ear, if not, skip this point.
+                        bool valid = true;
+                        for (uint j = 0; j < size; j++) {
+                            if (j == i0 || j == i1 || j == i2) continue;
+                            Vector2 p = polygonPoints[j];
+
+                            if (pointInTriangle(p, p0, p1, p2)) {
+                                valid = false;
+                                break;
+                            }
+                        }
+
+                        if (valid || !bestIsValid) {
+                            minAngle = angle;
+                            bestEar = i1;
+                            bestIsValid = valid;
+                        }
+                    }
+                }
+
+                nvDebugCheck(minAngle <= 2 * PI);
+
+                // Clip best ear:
+
+                uint i0 = (bestEar+size-1) % size;
+                uint i1 = (bestEar+0) % size;
+                uint i2 = (bestEar+1) % size;
+
+                int v0 = polygonVertices[i0];
+                int v1 = polygonVertices[i1];
+                int v2 = polygonVertices[i2];
+                
+                mesh->addFace(v0, v1, v2);
+
+                polygonVertices.removeAt(i1);
+                polygonPoints.removeAt(i1);
+                polygonAngles.removeAt(i1);
+            }
+        }
+        // The block below is compiled out.
+#if 0
+
+        uint i = 0;
+        while (polygonVertices.size() > 2 && i < polygonVertices.size()) {
+            uint size = polygonVertices.size();
+            uint i0 = (i+0) % size;
+            uint i1 = (i+1) % size;
+            uint i2 = (i+2) % size;
+
+            const HalfEdge::Vertex * v0 = polygonVertices[i0];
+            const HalfEdge::Vertex * v1 = polygonVertices[i1];
+            const HalfEdge::Vertex * v2 = polygonVertices[i2];
+
+            const Vector3 p0 = v0->pos;
+            const Vector3 p1 = v1->pos;
+            const Vector3 p2 = v2->pos;
+
+            const Vector3 e0 = p2 - p1;
+            const Vector3 e1 = p0 - p1;
+
+            // If this ear forms a valid triangle, setup relations, remove v1 and repeat.
+            Vector3 n = cross(e0, e1);
+            float len = dot(fn, n); // = sin(angle)
+            
+            float angle = asin(len);
+
+
+            if (len > 0.0f) {
+                mesh->addFace(v0->id(), v1->id(), v2->id());
+                polygonVertices.removeAt(i1);
+                polygonAngles.removeAt(i1);
+                if (i2 > i1) i2--;
+                // @@ Update angles at i0 and i2
+            }
+            else {
+                i++;
+            }
+        }
+
+        // @@ Create a few degenerate triangles to avoid introducing holes.
+        i = 0;
+        const uint size = polygonVertices.size();
+        while (i < size - 2) {
+            uint i0 = (i+0) % size;
+            uint i1 = (i+1) % size;
+            uint i2 = (i+2) % size;
+
+            const HalfEdge::Vertex * v0 = polygonVertices[i0];
+            const HalfEdge::Vertex * v1 = polygonVertices[i1];
+            const HalfEdge::Vertex * v2 = polygonVertices[i2];
+
+            mesh->addFace(v0->id(), v1->id(), v2->id());
+            i++;
+        }
+#endif
+    }
+
+    mesh->linkBoundary();
+
+    return mesh;
+}
+
+
diff --git a/thirdparty/thekla_atlas/nvmesh/param/Util.h b/thirdparty/thekla_atlas/nvmesh/param/Util.h
new file mode 100644
index 0000000000..774563ac0b
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/param/Util.h
@@ -0,0 +1,18 @@
+// This code is in the public domain -- castano@gmail.com
+#pragma once
+#include "nvmesh/nvmesh.h"
+
+namespace nv {
+
+    namespace HalfEdge { class Mesh; class Vertex; }
+    // Face-shape queries: true when every face has four (respectively three) edges.
+    bool isQuadMesh(const HalfEdge::Mesh * mesh);
+    bool isTriangularMesh(const HalfEdge::Mesh * mesh);
+    // Sum of (edge count - 2) over all faces; first vertex reporting isBoundary(), or NULL if there is none.
+    uint countMeshTriangles(const HalfEdge::Mesh * mesh);
+    const HalfEdge::Vertex * findBoundaryVertex(const HalfEdge::Mesh * mesh);
+    // Both build and return a mesh allocated with `new`; the input mesh is only read.
+    HalfEdge::Mesh * unifyVertices(const HalfEdge::Mesh * inputMesh);
+    HalfEdge::Mesh * triangulate(const HalfEdge::Mesh * inputMesh);
+
+} // nv namespace
diff --git a/thirdparty/thekla_atlas/nvmesh/raster/ClippedTriangle.h b/thirdparty/thekla_atlas/nvmesh/raster/ClippedTriangle.h
new file mode 100644
index 0000000000..0947d4851c
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/raster/ClippedTriangle.h
@@ -0,0 +1,159 @@
+// Copyright NVIDIA Corporation 2007 -- Denis Kovacs <den.kovacs@gmail.com>
+
+#pragma once
+#ifndef NV_MESH_CLIPPEDTRIANGLE_H
+#define NV_MESH_CLIPPEDTRIANGLE_H
+
+#include <nvmath/Vector.h>
+
+namespace nv
+{
+
+    class ClippedTriangle // Polygon obtained by clipping a triangle against axis-aligned half-planes, with its area and centroid.
+    {
+    public:
+        ClippedTriangle(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) 
+        {
+            m_numVertices = 3;
+            m_activeVertexBuffer = 0;
+            m_area = 0.0f; // area() / centroid() are defined even before computeAreaCentroid() has run.
+            m_verticesA[0]=a;
+            m_verticesA[1]=b;
+            m_verticesA[2]=c;
+            m_centroid = Vector2(0.0f);
+            m_vertexBuffers[0] = m_verticesA;
+            m_vertexBuffers[1] = m_verticesB;
+        }
+
+        uint vertexCount() const
+        {
+            return m_numVertices;
+        }
+
+        const Vector2 * vertices() const
+        {
+            return m_vertexBuffers[m_activeVertexBuffer];
+        }
+        // Keeps the part of the polygon where clipdirection * (offset - y) >= 0; the result lands in the other buffer.
+        inline void clipHorizontalPlane(float offset, float clipdirection) 
+        {
+            Vector2 * v  = m_vertexBuffers[m_activeVertexBuffer];
+            m_activeVertexBuffer ^= 1;
+            Vector2 * v2 = m_vertexBuffers[m_activeVertexBuffer];
+            // Repeat the first vertex at the end so that edge k is always (v[k], v[k+1]).
+            v[m_numVertices] = v[0];
+
+            float dy2,   dy1 = offset - v[0].y;
+            int   dy2in, dy1in = clipdirection*dy1 >= 0;
+            uint  p=0;
+
+            for (uint k=0; k<m_numVertices; k++)
+            {
+                dy2   = offset - v[k+1].y;
+                dy2in = clipdirection*dy2 >= 0;
+
+                if (dy1in) v2[p++] = v[k];
+                // dy is non-zero here: the end points differ in side, so their y values differ.
+                if ( dy1in + dy2in == 1 ) // not both in/out
+                {
+                    float dx = v[k+1].x - v[k].x;
+                    float dy = v[k+1].y - v[k].y;
+                    v2[p++] = Vector2(v[k].x + dy1*(dx/dy), offset);
+                }
+
+                dy1 = dy2; dy1in = dy2in;
+            }
+            m_numVertices = p;
+
+            //for (uint k=0; k<m_numVertices; k++) printf("(%f, %f)\n", v2[k].x, v2[k].y); printf("\n");
+        }
+        // Keeps the part of the polygon where clipdirection * (offset - x) >= 0; the result lands in the other buffer.
+        inline void clipVerticalPlane(float offset, float clipdirection ) 
+        {
+            Vector2 * v  = m_vertexBuffers[m_activeVertexBuffer];
+            m_activeVertexBuffer ^= 1;
+            Vector2 * v2 = m_vertexBuffers[m_activeVertexBuffer];
+            // Repeat the first vertex at the end so that edge k is always (v[k], v[k+1]).
+            v[m_numVertices] = v[0];
+
+            float dx2,   dx1   = offset - v[0].x;
+            int   dx2in, dx1in = clipdirection*dx1 >= 0;
+            uint  p=0;
+
+            for (uint k=0; k<m_numVertices; k++)
+            {
+                dx2 = offset - v[k+1].x;
+                dx2in = clipdirection*dx2 >= 0;
+
+                if (dx1in) v2[p++] = v[k];
+                // dx is non-zero here: the end points differ in side, so their x values differ.
+                if ( dx1in + dx2in == 1 ) // not both in/out
+                {
+                    float dx = v[k+1].x - v[k].x;
+                    float dy = v[k+1].y - v[k].y;
+                    v2[p++] = Vector2(offset, v[k].y + dx1*(dy/dx));
+                }
+
+                dx1 = dx2; dx1in = dx2in;
+            }
+            m_numVertices = p;
+
+            //for (uint k=0; k<m_numVertices; k++) printf("(%f, %f)\n", v2[k].x, v2[k].y); printf("\n");
+        }
+        // Computes m_area (always >= 0) and m_centroid of the current polygon.
+        void computeAreaCentroid()
+        {
+            Vector2 * v  = m_vertexBuffers[m_activeVertexBuffer];
+            v[m_numVertices] = v[0];
+            // twiceArea keeps its sign: the centroid must be divided by the signed area or it is negated for one orientation.
+            float twiceArea = 0;
+            float centroidx=0, centroidy=0;
+            for (uint k=0; k<m_numVertices; k++)
+            {
+                // http://local.wasp.uwa.edu.au/~pbourke/geometry/polyarea/
+                float f = v[k].x*v[k+1].y - v[k+1].x*v[k].y;
+                twiceArea += f;
+                centroidx += f * (v[k].x + v[k+1].x);
+                centroidy += f * (v[k].y + v[k+1].y);
+            }
+            m_area = 0.5f * fabs(twiceArea);
+            if (m_area==0) {
+                m_centroid = Vector2(0.0f);
+            } else {
+                m_centroid = Vector2(centroidx/(3*twiceArea), centroidy/(3*twiceArea));
+            }
+        }
+        // Clips to the box [x0, x1] x [y0, y1], then refreshes area and centroid.
+        void clipAABox(float x0, float y0, float x1, float y1)
+        {
+            clipVerticalPlane  ( x0, -1);
+            clipHorizontalPlane( y0, -1);
+            clipVerticalPlane  ( x1,  1);
+            clipHorizontalPlane( y1,  1);
+
+            computeAreaCentroid();
+        }
+
+        Vector2 centroid() const
+        {
+            return m_centroid;
+        }
+
+        float area() const
+        {
+            return m_area;
+        }
+        // NOTE: m_vertexBuffers points into this object's own arrays, so a copy would still reference the source's storage.
+    private:
+        Vector2 m_verticesA[7+1];
+        Vector2 m_verticesB[7+1];
+        Vector2 * m_vertexBuffers[2];
+        uint    m_numVertices;
+        uint    m_activeVertexBuffer;
+        float   m_area;
+        Vector2 m_centroid;
+    };
+
+} // nv namespace
+
+#endif // NV_MESH_CLIPPEDTRIANGLE_H
diff --git a/thirdparty/thekla_atlas/nvmesh/raster/Raster.cpp b/thirdparty/thekla_atlas/nvmesh/raster/Raster.cpp
new file mode 100644
index 0000000000..d46b34f045
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/raster/Raster.cpp
@@ -0,0 +1,626 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+/** @file Raster.cpp
+ * @brief Triangle rasterization library using affine interpolation. Not
+ * especially optimized, but good enough for my purposes.
+**/
+
+#include "nvmesh.h" // pch
+
+#include "Raster.h"
+#include "ClippedTriangle.h"
+
+#include "nvcore/Utils.h" // min, max
+
+#include "nvmath/Vector.inl"
+#include "nvmath/ftoi.h"
+
+
+#define RA_EPSILON		0.00001f
+
+using namespace nv;
+using namespace nv::Raster;
+
+namespace
+{
+    static inline float delta(float bot, float top, float ih) // Difference (bot - top) scaled by ih.
+    {
+        return (bot - top) * ih;
+    }
+    // Same computation for Vector2 operands.
+    static inline Vector2 delta(Vector2::Arg bot, Vector2::Arg top, float ih)
+    {
+        return (bot - top) * ih;
+    }
+    // Same computation for Vector3 operands.
+    static inline Vector3 delta(Vector3::Arg bot, Vector3::Arg top, float ih)
+    {
+        return (bot - top) * ih;
+    }
+
+    // @@ The implementation in nvmath.h should be equivalent.
+    static inline int iround(float f)
+    {
+        // Round half up: shift by one half, then take the floor.
+        // (A plain int(f) cast would truncate toward zero instead.)
+        const float shifted = f + 0.5f;
+        return static_cast<int>(floorf(shifted));
+    }
+
+    /// A triangle vertex. NOTE(review): does not appear to be used in the part of the file shown here.
+    struct Vertex
+    {
+        Vector2 pos;	// Position.
+        Vector3 tex;	// Texcoord. (Barycentric coordinate)
+    };
+
+
+    /// A triangle for rasterization: three positions, one Vector3 attribute per vertex and the attribute gradients.
+    struct Triangle
+    {
+        Triangle(Vector2::Arg v0, Vector2::Arg v1, Vector2::Arg v2, Vector3::Arg t0, Vector3::Arg t1, Vector3::Arg t2);
+        // Fills dx and dy; returns false when that is not possible.
+        bool computeDeltas();
+        // Rasterizers; draw and drawAA return false when the callback asked to stop. drawC is only declared in the code shown here.
+        bool draw(const Vector2 & extents, bool enableScissors, SamplingCallback cb, void * param);
+        bool drawAA(const Vector2 & extents, bool enableScissors, SamplingCallback cb, void * param);
+        bool drawC(const Vector2 & extents, bool enableScissors, SamplingCallback cb, void * param);
+        void flipBackface();
+        void computeUnitInwardNormals();
+
+        // Vertices, as stored by the constructor (possibly reordered by flipBackface).
+        Vector2 v1, v2, v3;
+        Vector2 n1, n2, n3; // unit inward normals
+        Vector3 t1, t2, t3;
+
+        // Deltas: attribute change per unit step along x and along y.
+        Vector3 dx, dy;
+
+        float sign; // NOTE(review): neither assigned nor read in the code shown here.
+        bool valid; // Result of computeDeltas(), stored by the constructor.
+    };
+
+
+    /// Triangle ctor. Stores the inputs with the second and third vertex exchanged,
+    /// fixes the winding, then derives the gradients, the 'valid' flag and the
+    /// edge normals.
+    Triangle::Triangle(Vector2::Arg v0, Vector2::Arg v1, Vector2::Arg v2,
+        Vector3::Arg t0, Vector3::Arg t1, Vector3::Arg t2)
+        // Inside the initializers a bare name is the constructor parameter, which
+        // hides the member of the same name: member v3 is set from parameter v1
+        // and member t3 from parameter t1.
+        : v1(v0), v2(v2), v3(v1)
+        , t1(t0), t2(t2), t3(t1)
+    {
+        // make sure every triangle is front facing.
+        flipBackface();
+
+        // computeDeltas() reports whether the gradients could be computed; the
+        // result is kept so that callers can skip triangles that cannot be drawn.
+        valid = computeDeltas();
+
+        // The normals are computed after flipBackface() so that they match the
+        // final vertex order.
+        computeUnitInwardNormals();
+    }
+
+
+    /// Computes the attribute gradients dx and dy: how much the interpolated
+    /// attribute changes per unit step along x and along y. The two edges that
+    /// leave v1 serve as a basis whose 2x2 matrix is inverted. Returns false, with
+    /// dx and dy left untouched, when the inverse determinant is not finite.
+    bool Triangle::computeDeltas()
+    {
+        // Position edges and the matching attribute edges, all relative to v1.
+        const Vector2 edgeA = v3 - v1;
+        const Vector2 edgeB = v2 - v1;
+        const Vector3 attrA = t3 - t1;
+        const Vector3 attrB = t2 - t1;
+
+        const float invDet = 1.0f / (edgeA.y * edgeB.x - edgeB.y * edgeA.x);
+        if (!isFinite(invDet)) {
+            return false; // e.g. a triangle without area
+        }
+
+        // Coefficients of the inverted edge matrix.
+        const float ax = - edgeB.y * invDet;
+        const float bx = edgeA.y * invDet;
+        const float ay = edgeB.x * invDet;
+        const float by = - edgeA.x * invDet;
+
+        dx = attrA * ax + attrB * bx;
+        dy = attrA * ay + attrB * by;
+        return true;
+    }
+
+    // Sets n1, n2 and n3 to the unit-length perpendiculars (-y, x) of the edges v1-v2, v2-v3 and v3-v1 (the unit inward normals).
+    void Triangle::computeUnitInwardNormals()
+    {
+        n1 = v1 - v2; n1 = Vector2(-n1.y, n1.x) * (1.0f/sqrtf(n1.y*n1.y + n1.x*n1.x));
+        n2 = v2 - v3; n2 = Vector2(-n2.y, n2.x) * (1.0f/sqrtf(n2.y*n2.y + n2.x*n2.x));
+        n3 = v3 - v1; n3 = Vector2(-n3.y, n3.x) * (1.0f/sqrtf(n3.y*n3.y + n3.x*n3.x));
+    }
+
+    void Triangle::flipBackface()
+    {
+        // 2D cross product of (v3 - v1) and (v2 - v1); a negative value marks a backfacing triangle.
+        const float facing = (v3.x-v1.x)*(v2.y-v1.y) - (v3.y-v1.y)*(v2.x-v1.x);
+        if (!(facing < 0)) return; // also taken for NaN, exactly like a plain '< 0' test
+        const Vector2 firstPos = v1; v1 = v2; v2 = firstPos; // exchange the positions of v1 and v2
+        const Vector3 firstTex = t1; t1 = t2; t2 = firstTex; // and the matching attributes
+    }
+
+    /// Nearest-sample rasterizer: integer half-space tests in 28.4 fixed point, evaluated per 8x8 pixel block.
+    /// Calls cb with coverage 1 for every pixel found inside; returns false as soon as cb does, true otherwise.
+    bool Triangle::draw(const Vector2 & extents, bool enableScissors, SamplingCallback cb, void * param)
+    {
+        // 28.4 fixed-point coordinates
+        const int Y1 = iround(16.0f * v1.y);
+        const int Y2 = iround(16.0f * v2.y);
+        const int Y3 = iround(16.0f * v3.y);
+
+        const int X1 = iround(16.0f * v1.x);
+        const int X2 = iround(16.0f * v2.x);
+        const int X3 = iround(16.0f * v3.x);
+
+        // Deltas
+        const int DX12 = X1 - X2;
+        const int DX23 = X2 - X3;
+        const int DX31 = X3 - X1;
+        const int DY12 = Y1 - Y2;
+        const int DY23 = Y2 - Y3;
+        const int DY31 = Y3 - Y1;
+
+        // Fixed-point deltas
+        const int FDX12 = DX12 << 4;
+        const int FDX23 = DX23 << 4;
+        const int FDX31 = DX31 << 4;
+        const int FDY12 = DY12 << 4;
+        const int FDY23 = DY23 << 4;
+        const int FDY31 = DY31 << 4;
+
+        int minx, miny, maxx, maxy;
+        if (enableScissors) {
+            int frustumX0 =  0 << 4;
+            int frustumY0 =  0 << 4;
+            int frustumX1 =  (int)extents.x << 4;
+            int frustumY1 =  (int)extents.y << 4;
+
+            // Bounding rectangle, clamped to the scissor rectangle and converted back to whole pixels
+            minx = (nv::max(min3(X1, X2, X3), frustumX0) + 0xF) >> 4;
+            miny = (nv::max(min3(Y1, Y2, Y3), frustumY0) + 0xF) >> 4;
+            maxx = (nv::min(max3(X1, X2, X3), frustumX1) + 0xF) >> 4;
+            maxy = (nv::min(max3(Y1, Y2, Y3), frustumY1) + 0xF) >> 4;
+        }
+        else {
+            // Bounding rectangle
+            minx = (min3(X1, X2, X3) + 0xF) >> 4;
+            miny = (min3(Y1, Y2, Y3) + 0xF) >> 4;
+            maxx = (max3(X1, X2, X3) + 0xF) >> 4;
+            maxy = (max3(Y1, Y2, Y3) + 0xF) >> 4;
+        }
+
+        // Block size, standard 8x8 (must be power of two)
+        const int q = 8;
+
+        // @@ This won't work when minx,miny are negative. This code path is not used. Leaving as is for now.
+        nvCheck(minx >= 0);
+        nvCheck(miny >= 0);
+
+        // Start in corner of 8x8 block
+        minx &= ~(q - 1);
+        miny &= ~(q - 1);
+
+        // Half-edge constants
+        int C1 = DY12 * X1 - DX12 * Y1;
+        int C2 = DY23 * X2 - DX23 * Y2;
+        int C3 = DY31 * X3 - DX31 * Y3;
+
+        // Correct for fill convention
+        if(DY12 < 0 || (DY12 == 0 && DX12 > 0)) C1++;
+        if(DY23 < 0 || (DY23 == 0 && DX23 > 0)) C2++;
+        if(DY31 < 0 || (DY31 == 0 && DX31 > 0)) C3++;
+
+        // Loop through blocks. Whole blocks are visited, so pixels past maxx/maxy can still be reported.
+        for(int y = miny; y < maxy; y += q)
+        {
+            for(int x = minx; x < maxx; x += q)
+            {
+                // Corners of block
+                int x0 = x << 4;
+                int x1 = (x + q - 1) << 4;
+                int y0 = y << 4;
+                int y1 = (y + q - 1) << 4;
+
+                // Evaluate half-space functions
+                bool a00 = C1 + DX12 * y0 - DY12 * x0 > 0;
+                bool a10 = C1 + DX12 * y0 - DY12 * x1 > 0;
+                bool a01 = C1 + DX12 * y1 - DY12 * x0 > 0;
+                bool a11 = C1 + DX12 * y1 - DY12 * x1 > 0;
+                int a = (a00 << 0) | (a10 << 1) | (a01 << 2) | (a11 << 3);
+
+                bool b00 = C2 + DX23 * y0 - DY23 * x0 > 0;
+                bool b10 = C2 + DX23 * y0 - DY23 * x1 > 0;
+                bool b01 = C2 + DX23 * y1 - DY23 * x0 > 0;
+                bool b11 = C2 + DX23 * y1 - DY23 * x1 > 0;
+                int b = (b00 << 0) | (b10 << 1) | (b01 << 2) | (b11 << 3);
+
+                bool c00 = C3 + DX31 * y0 - DY31 * x0 > 0;
+                bool c10 = C3 + DX31 * y0 - DY31 * x1 > 0;
+                bool c01 = C3 + DX31 * y1 - DY31 * x0 > 0;
+                bool c11 = C3 + DX31 * y1 - DY31 * x1 > 0;
+                int c = (c00 << 0) | (c10 << 1) | (c01 << 2) | (c11 << 3);
+
+                // Skip block when outside an edge
+                if(a == 0x0 || b == 0x0 || c == 0x0) continue;
+
+                // Accept whole block when totally covered
+                if(a == 0xF && b == 0xF && c == 0xF)
+                {
+                    // Attribute at the block's first pixel. v1 and the gradients are in pixels, so use x and y, not the 28.4 values x0 and y0.
+                    Vector3 texRow = t1 + dy*(y - v1.y) + dx*(x - v1.x);
+                    for(int iy = y; iy < y + q; iy++)
+                    {
+                        Vector3 tex = texRow;
+                        for(int ix = x; ix < x + q; ix++)
+                        {
+                            //Vector3 tex = t1 + dx * (ix - v1.x) + dy * (iy - v1.y);
+                            if (!cb(param, ix, iy, tex, dx, dy, 1.0)) {
+                                // early out.
+                                return false;
+                            }
+                            tex += dx;
+                        }
+                        texRow += dy;
+                    }
+                }
+                else // Partially covered block
+                {
+                    int CY1 = C1 + DX12 * y0 - DY12 * x0;
+                    int CY2 = C2 + DX23 * y0 - DY23 * x0;
+                    int CY3 = C3 + DX31 * y0 - DY31 * x0;
+                    // Attribute at the block's first pixel, in pixel units (x, y) like v1 and the gradients.
+                    Vector3 texRow = t1 + dy*(y - v1.y) + dx*(x - v1.x);
+                    for(int iy = y; iy < y + q; iy++)
+                    {
+                        int CX1 = CY1;
+                        int CX2 = CY2;
+                        int CX3 = CY3;
+                        Vector3 tex = texRow;
+
+                        for(int ix = x; ix < x + q; ix++)
+                        {
+                            if(CX1 > 0 && CX2 > 0 && CX3 > 0)
+                            {
+                                if (!cb(param, ix, iy, tex, dx, dy, 1.0))
+                                {
+                                    // early out.
+                                    return false;
+                                }
+                            }
+
+                            CX1 -= FDY12;
+                            CX2 -= FDY23;
+                            CX3 -= FDY31;
+                            tex += dx;
+                        }
+
+                        CY1 += FDX12;
+                        CY2 += FDX23;
+                        CY3 += FDX31;
+                        texRow += dy;
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+
+#define PX_INSIDE    (1.0f/sqrt(2.0f))   // half the diagonal of a 1x1 pixel
+#define PX_OUTSIDE  (-1.0f/sqrt(2.0f))
+
+#define BK_SIZE 8
+#define BK_INSIDE   (sqrt(BK_SIZE*BK_SIZE/2.0f))   // half the diagonal of a BK_SIZE x BK_SIZE block
+#define BK_OUTSIDE (-sqrt(BK_SIZE*BK_SIZE/2.0f))
+
+    // Antialiased rasterizer working on BK_SIZE x BK_SIZE blocks; partially covered pixels are reported with their covered area. extents has to be multiple of BK_SIZE!!
+    bool Triangle::drawAA(const Vector2 & extents, bool enableScissors, SamplingCallback cb, void * param)
+    {
+        float minx, miny, maxx, maxy;
+        if (enableScissors) {
+            // Bounding rectangle, clamped to [0, extents - 1]
+            minx = floorf(max(min3(v1.x, v2.x, v3.x), 0.0f));
+            miny = floorf(max(min3(v1.y, v2.y, v3.y), 0.0f));
+            maxx = ceilf( min(max3(v1.x, v2.x, v3.x), extents.x-1.0f));
+            maxy = ceilf( min(max3(v1.y, v2.y, v3.y), extents.y-1.0f));
+        }
+        else {
+            // Bounding rectangle
+            minx = floorf(min3(v1.x, v2.x, v3.x));
+            miny = floorf(min3(v1.y, v2.y, v3.y));
+            maxx = ceilf( max3(v1.x, v2.x, v3.x));
+            maxy = ceilf( max3(v1.y, v2.y, v3.y));
+        }
+
+        // There's no reason to align the blocks to the viewport, instead we align them to the origin of the triangle bounds.
+        minx = floorf(minx); // minx and miny already went through floorf above
+        miny = floorf(miny);
+        //minx = (float)(((int)minx) & (~((int)BK_SIZE - 1))); // align to blocksize (we don't need to worry about blocks partially out of viewport)
+        //miny = (float)(((int)miny) & (~((int)BK_SIZE - 1)));
+
+        minx += 0.5; miny +=0.5;  // sampling at texel centers!
+        maxx += 0.5; maxy +=0.5; 
+
+        // Half-edge constants: with Ci = -dot(ni, vi), the value Ci + dot(ni, p) is the signed distance of p to edge i
+        float C1 = n1.x * (-v1.x) + n1.y * (-v1.y);
+        float C2 = n2.x * (-v2.x) + n2.y * (-v2.y);
+        float C3 = n3.x * (-v3.x) + n3.y * (-v3.y);
+
+        // Loop through blocks; (x0, y0) is the center of the first pixel of each block
+        for(float y0 = miny; y0 <= maxy; y0 += BK_SIZE)
+        {
+            for(float x0 = minx; x0 <= maxx; x0 += BK_SIZE)
+            {
+                // Center of the block
+                float xc = (x0 + (BK_SIZE-1)/2.0f);
+                float yc = (y0 + (BK_SIZE-1)/2.0f);
+
+                // Evaluate half-space functions at the block center
+                float aC = C1 + n1.x * xc + n1.y * yc;
+                float bC = C2 + n2.x * xc + n2.y * yc;
+                float cC = C3 + n3.x * xc + n3.y * yc;
+
+                // Skip block when outside an edge (center farther out than half the block diagonal)
+                if( (aC <= BK_OUTSIDE) || (bC <= BK_OUTSIDE) || (cC <= BK_OUTSIDE) ) continue;
+
+                // Accept whole block when totally covered (center at least half a block diagonal inside every edge)
+                if( (aC >= BK_INSIDE) && (bC >= BK_INSIDE) && (cC >= BK_INSIDE) )
+                {
+                    Vector3 texRow = t1 + dy*(y0 - v1.y) + dx*(x0 - v1.x);
+
+                    for (float y = y0; y < y0 + BK_SIZE; y++)
+                    {
+                        Vector3 tex = texRow;
+                        for(float x = x0; x < x0 + BK_SIZE; x++)
+                        {
+                            if (!cb(param, (int)x, (int)y, tex, dx, dy, 1.0f))
+                            {
+                                return false;
+                            }
+                            tex += dx;
+                        }
+                        texRow += dy;
+                    }
+                }
+                else // Partially covered block
+                {
+                    float CY1 = C1 + n1.x * x0 + n1.y * y0;
+                    float CY2 = C2 + n2.x * x0 + n2.y * y0;
+                    float CY3 = C3 + n3.x * x0 + n3.y * y0;
+                    Vector3 texRow = t1 + dy*(y0 - v1.y) + dx*(x0 - v1.x);	                  	
+
+                    for(float y = y0; y < y0 + BK_SIZE; y++) // @@ This is not clipping to scissor rectangle correctly.
+                    {
+                        float CX1 = CY1;
+                        float CX2 = CY2;
+                        float CX3 = CY3;
+                        Vector3 tex = texRow;
+
+                        for (float x = x0; x < x0 + BK_SIZE; x++)   // @@ This is not clipping to scissor rectangle correctly.
+                        {
+                            if (CX1 >= PX_INSIDE && CX2 >= PX_INSIDE && CX3 >= PX_INSIDE) 
+                            {
+                                // pixel completely covered; this local tex is recomputed from scratch and hides the incrementally stepped one
+                                Vector3 tex = t1 + dx * (x - v1.x) + dy * (y - v1.y);
+                                if (!cb(param, (int)x, (int)y, tex, dx, dy, 1.0f))
+                                {
+                                    return false;
+                                }
+                            }
+                            else if ((CX1 >= PX_OUTSIDE) && (CX2 >= PX_OUTSIDE) && (CX3 >= PX_OUTSIDE))
+                            {
+                                // triangle partially covers pixel. do clipping against the unit box around the pixel center.
+                                ClippedTriangle ct(v1-Vector2(x,y), v2-Vector2(x,y), v3-Vector2(x,y));
+                                ct.clipAABox(-0.5, -0.5, 0.5, 0.5);
+                                Vector2 centroid = ct.centroid();
+                                float area = ct.area();
+                                if (area > 0.0f)
+                                {
+                                    Vector3 texCent = tex - dx*centroid.x - dy*centroid.y;
+                                    //nvCheck(texCent.x >= -0.1f && texCent.x <= 1.1f); // @@ Centroid is not very exact...
+                                    //nvCheck(texCent.y >= -0.1f && texCent.y <= 1.1f);
+                                    //nvCheck(texCent.z >= -0.1f && texCent.z <= 1.1f);
+                                    //Vector3 texCent2 = t1 + dx * (x - v1.x) + dy * (y - v1.y);
+                                    if (!cb(param, (int)x, (int)y, texCent, dx, dy, area))
+                                    {
+                                        return false;
+                                    }
+                                }
+                            }
+                            // Step the edge functions and the attribute one pixel to the right.
+                            CX1 += n1.x;
+                            CX2 += n2.x;
+                            CX3 += n3.x;
+                            tex += dx;
+                        }
+                        // Step the row start values one pixel down.
+                        CY1 += n1.y;
+                        CY2 += n2.y;
+                        CY3 += n3.y;
+                        texRow += dy;
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+} // namespace
+
+
+/// Process the given triangle. Returns false only when the callback interrupted the
+/// rasterization; a triangle that is not valid, or an unknown mode, draws nothing.
+bool nv::Raster::drawTriangle(Mode mode, Vector2::Arg extents, bool enableScissors, const Vector2 v[3], SamplingCallback cb, void * param)
+{
+    // @@ It would be nice to have a conservative drawing mode that enlarges the triangle extents by one texel and is able to handle degenerate triangles.
+    // @@ Maybe the simplest thing to do would be raster triangle edges.
+
+    // Each corner gets its own unit axis as attribute.
+    Triangle tri(v[0], v[1], v[2], Vector3(1, 0, 0), Vector3(0, 1, 0), Vector3(0, 0, 1));
+    if (!tri.valid) {
+        return true;
+    }
+
+    switch (mode) {
+        case Mode_Antialiased: return tri.drawAA(extents, enableScissors, cb, param);
+        case Mode_Nearest:     return tri.draw(extents, enableScissors, cb, param);
+        default:               return true;
+    }
+}
+
+inline static float triangleArea(Vector2::Arg v1, Vector2::Arg v2, Vector2::Arg v3) // Signed area of the triangle v1, v2, v3.
+{
+    return 0.5f * (v3.x * v1.y + v1.x * v2.y + v2.x * v3.y - v2.x * v1.y - v3.x * v2.y - v1.x * v3.y); // half the 2D cross product of (v2 - v1) and (v3 - v1), expanded
+}
+
+/// Draws the two triangles of an already split quad with the requested mode.
+/// Nothing is drawn unless both triangles are valid; the second triangle is
+/// skipped when the callback interrupts the first one.
+static bool drawTrianglePair(Triangle & first, Triangle & second, Mode mode, Vector2::Arg extents, bool enableScissors, SamplingCallback cb, void * param)
+{
+    if (!first.valid || !second.valid) {
+        return true;
+    }
+
+    if (mode == Mode_Antialiased) {
+        return first.drawAA(extents, enableScissors, cb, param) && second.drawAA(extents, enableScissors, cb, param);
+    }
+    return first.draw(extents, enableScissors, cb, param) && second.draw(extents, enableScissors, cb, param);
+}
+
+/// Process the given quad. Returns false only when the callback interrupted the
+/// rasterization.
+bool nv::Raster::drawQuad(Mode mode, Vector2::Arg extents, bool enableScissors, const Vector2 v[4], SamplingCallback cb, void * param)
+{
+    // Orientation of the two triangles obtained by cutting along the v[0]-v[2] diagonal.
+    const bool sign0 = triangleArea(v[0], v[1], v[2]) > 0.0f;
+    const bool sign1 = triangleArea(v[0], v[2], v[3]) > 0.0f;
+
+    // Divide the quad into two non overlapping triangles.
+    if (sign0 == sign1) {
+        Triangle tri0(v[0], v[1], v[2], Vector3(0,0,0), Vector3(1,0,0), Vector3(1,1,0));
+        Triangle tri1(v[0], v[2], v[3], Vector3(0,0,0), Vector3(1,1,0), Vector3(0,1,0));
+        return drawTrianglePair(tri0, tri1, mode, extents, enableScissors, cb, param);
+    }
+
+    // The orientations disagree, so cut along the v[1]-v[3] diagonal instead.
+    Triangle tri0(v[0], v[1], v[3], Vector3(0,0,0), Vector3(1,0,0), Vector3(0,1,0));
+    Triangle tri1(v[1], v[2], v[3], Vector3(1,0,0), Vector3(1,1,0), Vector3(0,1,0));
+    return drawTrianglePair(tri0, tri1, mode, extents, enableScissors, cb, param);
+}
+
+
+// Reports one pixel to the callback: the pixel obtained from p with ftoi_round, the
+// parameter t of the point of the segment v[0]-v[1] nearest to that pixel's center,
+// and saturate(1 - distance to that point). Returns whatever the callback returns.
+static bool drawPoint(const Vector2 & p, const Vector2 v[2], LineSamplingCallback cb, void * param) {
+
+    const int px = ftoi_round(p.x);
+    const int py = ftoi_round(p.y);
+    const Vector2 center = Vector2(float(px) + 0.5f, float(py) + 0.5f);
+
+    const Vector2 seg = v[1] - v[0];
+    const float lenSq = nv::lengthSquared(seg);  // squared length, avoids a sqrt
+
+    // t parameterizes the segment as v[0] + t * seg. It stays 0 when both end
+    // points coincide.
+    float t = 0;
+    if (lenSq != 0.0) {
+        // Projection of 'center' onto the infinite line: t = [(c-v0) . seg] / |seg|^2
+        t = dot(center - v[0], seg) / lenSq;
+
+        // Clamp to the segment.
+        if (t < 0.0) {
+            t = 0;                      // Beyond the 'v0' end of the segment
+        }
+        else if (t > 1.0) {
+            t = 1;                      // Beyond the 'v1' end of the segment
+        }
+    }
+
+    const Vector2 nearest = v[0] + t * seg;
+    const float gap = distance(center, nearest);
+
+    return cb(param, px, py, t, saturate(1-gap));
+}
+
+
+void nv::Raster::drawLine(bool antialias, Vector2::Arg extents, bool enableScissors, const Vector2 v[2], LineSamplingCallback cb, void * param)
+{
+    nvCheck(antialias == true);         // @@ Not implemented.
+    //nvCheck(enableScissors == false); // @@ Not implemented.
+
+    // Very crappy DDA implementation. extents and enableScissors are not used,
+    // and the value returned by the callback is ignored.
+    const float dx = v[1].x - v[0].x;
+    const float dy = v[1].y - v[0].y;
+
+    // Degenerate line.
+    if (dx == 0 && dy == 0) return;
+
+    // Walk along the axis with the larger extent, one unit per step.
+    const float adx = fabsf(dx);
+    const float ady = fabsf(dy);
+    const bool xMajor = (adx >= ady);
+    const float major = xMajor ? adx : ady;
+
+    const int n = iround(major);
+    const Vector2 dp(dx / major, dy / major);
+
+    // Unit offset across the major axis, used to visit the two neighbours of each sample.
+    Vector2 dpdy;
+    if (xMajor) {
+        nvDebugCheck(fabsf(dp.y) <= 1.0f);
+        dpdy = Vector2(0.0f, 1.0f);
+    }
+    else {
+        nvDebugCheck(fabsf(dp.x) <= 1.0f);
+        dpdy = Vector2(1.0f, 0.0f);
+    }
+
+    Vector2 p = v[0];
+    for (int i = 0; i <= n; i++) {
+        drawPoint(p, v, cb, param);
+        drawPoint(p + dpdy, v, cb, param);
+        drawPoint(p - dpdy, v, cb, param);
+        p += dp;
+    }
+}
+
+
+// Draw vertical or horizontal segments. For degenerate triangles.
+/*bool nv::Raster::drawSegment(Vector2::Arg extents, bool enableScissors, const Vector2 v[2], LineSamplingCallback cb, void * param)
+{
+    nvCheck(enableScissors == false);
+
+    
+    if (v[0].x == v[1].x) {         // Vertical segment.
+        
+    }
+    else if (v[0].y == v[1].y) {    // Horizontal segment.
+        int y = ftoi_round(v[0].y);
+        int x0 = ftoi_floor(v[0].x);
+        int x1 = ftoi_floor(v[0].x);
+
+        for (int x = x0; x <= x1; x++) {
+
+            cb(param, x, y, t, 
+        }
+    }
+
+    return false; // Not a valid segment.
+}
+*/
diff --git a/thirdparty/thekla_atlas/nvmesh/raster/Raster.h b/thirdparty/thekla_atlas/nvmesh/raster/Raster.h
new file mode 100644
index 0000000000..05af2ddb00
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/raster/Raster.h
@@ -0,0 +1,49 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MESH_RASTER_H
+#define NV_MESH_RASTER_H
+
+/** @file Raster.h
+ * @brief Rasterization library.
+ *
+ * This is just a standard scanline rasterizer that I took from one of my old
+ * projects. The perspective correction wasn't necessary so I just removed it.
+**/
+
+#include "nvmath/Vector.h"
+#include "nvmesh/nvmesh.h"
+
+namespace nv
+{
+
+    namespace Raster 
+    {
+        enum Mode {
+            Mode_Nearest,       // every reported pixel has coverage 1
+            Mode_Antialiased,   // partially covered pixels are reported with their covered area
+            //Mode_Conservative
+        };
+
+        // bar is the interpolated vertex attribute at the sample, dx and dy are its gradients, coverage lies in (0, 1].
+        /// A callback to sample the environment. Return false to terminate rasterization.
+        typedef bool (NV_CDECL * SamplingCallback)(void * param, int x, int y, Vector3::Arg bar, Vector3::Arg dx, Vector3::Arg dy, float coverage);
+
+        // Process the given triangle. Returns false if rasterization was interrupted by the callback.
+        NVMESH_API bool drawTriangle(Mode mode, Vector2::Arg extents, bool enableScissors, const Vector2 v[3], SamplingCallback cb, void * param);
+
+        // Process the given quad. Returns false if rasterization was interrupted by the callback.
+        NVMESH_API bool drawQuad(Mode mode, Vector2::Arg extents, bool enableScissors, const Vector2 v[4], SamplingCallback cb, void * param);
+
+        typedef bool (NV_CDECL * LineSamplingCallback)(void * param, int x, int y, float t, float d);    // t is the position along the segment, d is saturate(1 - distance from the pixel center to the segment).
+
+        // Process the given line. The value returned by the callback is ignored.
+        NVMESH_API void drawLine(bool antialias, Vector2::Arg extents, bool enableScissors, const Vector2 v[2], LineSamplingCallback cb, void * param);
+
+        // Draw vertical or horizontal segments. For degenerate triangles. (Disabled.)
+        //NVMESH_API void drawSegment(Vector2::Arg extents, bool enableScissors, const Vector2 v[2], SamplingCallback cb, void * param);
+    }
+}
+
+
+#endif // NV_MESH_RASTER_H
diff --git a/thirdparty/thekla_atlas/nvmesh/weld/Snap.cpp b/thirdparty/thekla_atlas/nvmesh/weld/Snap.cpp
new file mode 100644
index 0000000000..b6bff4d83d
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/weld/Snap.cpp
@@ -0,0 +1,100 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include <nvcore/RadixSort.h>
+
+#include <nvmesh/weld/Snap.h>
+#include <nvmesh/TriMesh.h>
+#include <nvmesh/geometry/Bounds.h>
+
+using namespace nv;
+
+namespace {
+
+	// Moves both vertices to their midpoint. Texture coordinates and normals are
+	// averaged as well, but only when every component already agrees within its threshold.
+	void Snap(TriMesh::Vertex & a, TriMesh::Vertex & b, float texThreshold, float norThreshold)
+	{
+		b.pos = (a.pos + b.pos) * 0.5f;
+		a.pos = b.pos;
+
+		const bool texMatch = equal(a.tex.x, b.tex.x, texThreshold) && equal(a.tex.y, b.tex.y, texThreshold);
+		if (texMatch) { a.tex = (a.tex + b.tex) * 0.5f; b.tex = a.tex; }
+
+		const bool norMatch = equal(a.nor.x, b.nor.x, norThreshold) && equal(a.nor.y, b.nor.y, norThreshold) && equal(a.nor.z, b.nor.z, norThreshold);
+		if (norMatch) { a.nor = (a.nor + b.nor) * 0.5f; b.nor = a.nor; }
+	}
+
+} // anonymous namespace
+
+// Snaps together vertices that lie within posThreshold of each other and returns the
+// number of snapped pairs. The vertices are sorted along the longest axis of the mesh
+// bounds, so each vertex is only compared against the ones that follow it in that order.
+uint nv::SnapVertices(TriMesh * mesh, float posThreshold, float texThreshold, float norThreshold)
+{
+	nvDebug("--- Snapping vertices.\n");
+	
+	// Determine largest axis.
+	Box box = MeshBounds::box(mesh);
+	Vector3 extents = box.extents();
+
+	int axis = 2;
+	if( extents.x > extents.y ) {
+		if( extents.x > extents.z ) {
+			axis = 0;
+		}
+	}
+	else if(extents.y > extents.z) {
+		axis = 1;
+	}
+	
+	// @@ Use diagonal instead!
+
+	// Sort vertices according to the largest axis.
+	const uint vertexCount = mesh->vertexCount();
+	nvCheck(vertexCount >= 2); // Must have at least two vertices.
+
+	// Sort key: the coordinate of every vertex along the chosen axis.
+	Array<float> distArray;
+	distArray.resize(vertexCount);
+
+	for(uint v = 0; v < vertexCount; v++) {
+		if (axis == 0) distArray[v] = mesh->vertexAt(v).pos.x;
+		else if (axis == 1) distArray[v] = mesh->vertexAt(v).pos.y;
+		else distArray[v] = mesh->vertexAt(v).pos.z;
+	}
+
+	// xrefs is used below to go from a position in the sorted order to a vertex index.
+	RadixSort radix;
+	const uint * xrefs = radix.sort(distArray.buffer(), distArray.count()).ranks();
+	nvCheck(xrefs != NULL);
+
+	uint snapCount = 0;
+	// 'v + 1 < vertexCount' instead of 'v < vertexCount-1': the subtraction wraps around when vertexCount is 0.
+	for(uint v = 0; v + 1 < vertexCount; v++) {
+		for(uint n = v+1; n < vertexCount; n++) {
+			nvDebugCheck( distArray[xrefs[v]] <= distArray[xrefs[n]] );
+
+			// Too far apart along the sort axis already; the keys that follow are even larger.
+			if (fabs(distArray[xrefs[n]] - distArray[xrefs[v]]) > posThreshold) {
+				break;
+			}
+
+			TriMesh::Vertex & v0 = mesh->vertexAt(xrefs[v]);
+			TriMesh::Vertex & v1 = mesh->vertexAt(xrefs[n]);
+
+			const float dist = length(v0.pos - v1.pos);
+
+			if (dist <= posThreshold) {
+				Snap(v0, v1, texThreshold, norThreshold);
+				snapCount++;
+			}
+		}
+	}
+
+	// @@ todo: debug, make sure that the distance between vertices is now >= threshold
+
+	nvDebug("---   %u vertices snapped\n", snapCount);
+
+	return snapCount;
+}
+
diff --git a/thirdparty/thekla_atlas/nvmesh/weld/Snap.h b/thirdparty/thekla_atlas/nvmesh/weld/Snap.h
new file mode 100644
index 0000000000..8e0566cda3
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/weld/Snap.h
@@ -0,0 +1,18 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MESH_SNAP_H
+#define NV_MESH_SNAP_H
+
+#include <nvmesh/nvmesh.h>
+#include <nvmath/nvmath.h>
+
+namespace nv
+{
+	class TriMesh;
+	// Snaps vertices that lie within posThreshold of each other; returns the number of snapped pairs.
+	NVMESH_API uint SnapVertices(TriMesh * mesh, float posThreshold=NV_EPSILON, float texThreshold=1.0f/1024, float norThreshold=NV_NORMAL_EPSILON);
+
+} // nv namespace
+
+
+#endif // NV_MESH_SNAP_H
diff --git a/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.cpp b/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.cpp
new file mode 100644
index 0000000000..2ba4dcae18
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.cpp
@@ -0,0 +1,205 @@
+// Copyright NVIDIA Corporation 2006 -- Ignacio Castano <icastano@nvidia.com>
+
+#include <nvmesh/TriMesh.h>
+#include <nvmesh/QuadTriMesh.h>
+
+#include <nvmesh/weld/VertexWeld.h>
+#include <nvmesh/weld/Weld.h>
+
+using namespace nv;
+
+// Removes duplicated vertices from a triangle mesh and points every face at the
+// surviving copies.
+void nv::WeldVertices(TriMesh * mesh)
+{
+	nvDebug("--- Welding vertices.\n");
+	nvCheck(mesh != NULL);
+
+	// After welding, remap[i] is the new index of the vertex that used to be i.
+	const uint oldCount = mesh->vertexCount();
+	Array<uint> remap;
+	Weld<TriMesh::Vertex> welder;
+	const uint keptCount = welder(mesh->vertices(), remap);
+
+	nvDebug("---   %d vertices welded\n", oldCount - keptCount);
+
+	// Remap faces.
+	const uint total = mesh->faceCount();
+	for (uint faceIndex = 0; faceIndex < total; faceIndex++)
+	{
+		TriMesh::Face & tri = mesh->faceAt(faceIndex);
+		for (int corner = 0; corner < 3; corner++) {
+			tri.v[corner] = remap[tri.v[corner]];
+		}
+	}
+}
+
+
+// Removes duplicated vertices from a mixed quad/triangle mesh and points every
+// face at the surviving copies.
+void nv::WeldVertices(QuadTriMesh * mesh)
+{
+	nvDebug("--- Welding vertices.\n");
+
+	nvCheck(mesh != NULL);
+
+	// After welding, remap[i] is the new index of the vertex that used to be i.
+	const uint oldCount = mesh->vertexCount();
+	Array<uint> remap;
+	Weld<TriMesh::Vertex> welder;
+	const uint keptCount = welder(mesh->vertices(), remap);
+
+	nvDebug("---   %d vertices welded\n", oldCount - keptCount);
+
+	// Remap faces.
+	const uint total = mesh->faceCount();
+	for (uint faceIndex = 0; faceIndex < total; faceIndex++)
+	{
+		QuadTriMesh::Face & face = mesh->faceAt(faceIndex);
+		for (int corner = 0; corner < 3; corner++) {
+			face.v[corner] = remap[face.v[corner]];
+		}
+
+		// The fourth index is remapped for quad faces only.
+		if (face.isQuadFace()) face.v[3] = remap[face.v[3]];
+	}
+}
+
+
+
+// OLD code
+
+#if 0
+
+namespace {
+
+struct VertexInfo {
+	uint id;			///< Original vertex id.
+	uint normal_face_group;
+	uint tangent_face_group;
+	uint material;
+	uint chart;
+};
+
+
+/// VertexInfo hash functor.
+struct VertexHash : public IHashFunctor<VertexInfo> {
+	VertexHash(PiMeshPtr m) : mesh(m) {
+		uint c = mesh->FindChannel(VS_POS);
+		piCheck(c != PI_NULL_INDEX);
+		channel = mesh->GetChannel(c);
+		piCheck(channel != NULL);
+	}
+
+	uint32 operator () (const VertexInfo & v) const {
+		return channel->data[v.id].GetHash();
+	}
+	
+private:
+	PiMeshPtr mesh;
+	PiMesh::Channel * channel;
+};
+
+
+/// VertexInfo comparator.
+struct VertexEqual : public IBinaryPredicate<VertexInfo> {
+	VertexEqual(PiMeshPtr m) : mesh(m) {}
+	
+	bool operator () (const VertexInfo & a, const VertexInfo & b) const {
+
+		bool equal = a.normal_face_group == b.normal_face_group && 
+			a.tangent_face_group == b.tangent_face_group &&
+			a.material == b.material && 
+			a.chart == b.chart;
+		
+		// Split vertex shared by different face types.
+		if( !equal ) {
+			return false;
+		}
+		
+		// They were the same vertex.
+		if( a.id == b.id ) {
+			return true;
+		}
+		
+		// Vertex equal if all the channels are equal.
+		return mesh->IsVertexEqual(a.id, b.id);
+	}
+
+private:	
+	PiMeshPtr mesh;
+};
+
+} // namespace
+
+
+/// Weld the vertices.
+void PiMeshVertexWeld::WeldVertices(const PiMeshSmoothGroup * mesh_smooth_group, 
+	const PiMeshMaterial * mesh_material, const PiMeshAtlas * mesh_atlas ) 
+{
+	piDebug( "--- Welding vertices:\n" );
+
+	piDebug( "---   Expand mesh vertices.\n" );
+	PiArray<VertexInfo> vertex_array;
+
+	const uint face_num = mesh->GetFaceNum();
+	const uint vertex_max = face_num * 3;
+	vertex_array.Resize( vertex_max );
+
+	for(uint i = 0; i < vertex_max; i++) {
+
+		uint f = i/3;
+	
+		const PiMesh::Face & face = mesh->GetFace(f);
+		vertex_array[i].id = face.v[i%3];
+
+		// Reset face attributes.
+		vertex_array[i].normal_face_group = PI_NULL_INDEX;
+		vertex_array[i].tangent_face_group = PI_NULL_INDEX;
+		vertex_array[i].material = PI_NULL_INDEX;
+		vertex_array[i].chart = PI_NULL_INDEX;
+		
+		// Set available attributes.
+		if( mesh_smooth_group != NULL ) {
+			if( mesh_smooth_group->HasNormalFaceGroups() ) {
+				vertex_array[i].normal_face_group = mesh_smooth_group->GetNormalFaceGroup( f );
+			}
+			if( mesh_smooth_group->HasTangentFaceGroups() ) {
+				vertex_array[i].tangent_face_group = mesh_smooth_group->GetTangentFaceGroup( f );
+			}
+		}
+		if( mesh_material != NULL ) {
+			vertex_array[i].material = mesh_material->GetFaceMaterial( f );
+		}
+		if( mesh_atlas != NULL && mesh_atlas->HasCharts() ) {
+			vertex_array[i].chart = mesh_atlas->GetFaceChart( f );
+		}
+	}
+	piDebug( "---   %d vertices.\n", vertex_max );
+
+	piDebug( "---   Collapse vertices.\n" );
+
+	uint * xrefs = new uint[vertex_max];
+	VertexHash hash(mesh);
+	VertexEqual equal(mesh);
+	const uint vertex_num = Weld( vertex_array, xrefs, hash, equal );
+	piCheck(vertex_num <= vertex_max);
+	piDebug( "---   %d vertices.\n", vertex_num );	
+	
+	// Remap face indices.
+	piDebug( "---   Remapping face indices.\n" );
+	mesh->RemapFaceIndices(vertex_max, xrefs);
+
+
+	// Overwrite xrefs to map new vertices to old vertices.
+	for(uint v = 0; v < vertex_num; v++) {
+		xrefs[v] = vertex_array[v].id;
+	}
+	
+	// Update vertex order.
+	mesh->ReorderVertices(vertex_num, xrefs);
+
+	delete [] xrefs;
+}
+
+#endif // 0
diff --git a/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.h b/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.h
new file mode 100644
index 0000000000..1dc2e4ba4d
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/weld/VertexWeld.h
@@ -0,0 +1,19 @@
+// Copyright NVIDIA Corporation 2006 -- Ignacio Castano <icastano@nvidia.com>
+
+#ifndef NV_MESH_VERTEXWELD_H
+#define NV_MESH_VERTEXWELD_H
+
+#include <nvmesh/nvmesh.h>
+
+namespace nv
+{
+	class TriMesh;
+	class QuadMesh;
+	class QuadTriMesh; // used by the second overload below; it was never declared here
+
+	NVMESH_API void WeldVertices(TriMesh * mesh);		// vertex welding entry point for TriMesh
+	NVMESH_API void WeldVertices(QuadTriMesh * mesh);	// vertex welding entry point for QuadTriMesh
+} // nv namespace
+
+
+#endif // NV_MESH_VERTEXWELD_H
diff --git a/thirdparty/thekla_atlas/nvmesh/weld/Weld.h b/thirdparty/thekla_atlas/nvmesh/weld/Weld.h
new file mode 100644
index 0000000000..e615539461
--- /dev/null
+++ b/thirdparty/thekla_atlas/nvmesh/weld/Weld.h
@@ -0,0 +1,171 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MESH_WELD_H
+#define NV_MESH_WELD_H
+
+#include "nvcore/Array.h"
+#include "nvcore/Hash.h"
+#include "nvcore/Utils.h" // nextPowerOfTwo
+
+#include <string.h> // for memset, memcmp, memcpy
+
+// Weld function to remove array duplicates in linear time using hashing.
+
+namespace nv
+{
+
+/// Generic welding functor. Removes duplicate elements from the array p in place,
+/// keeping the first occurrence of each one in its original relative order, and
+/// fills xrefs so that xrefs[i] is the index in the compacted p of input element i.
+/// Candidates are bucketed with the hash functor H and compared with the functor E.
+/// This code is based on the ideas of Ville Miettinen and Pierre Terdiman.
+template <class T, class H=Hash<T>, class E=Equal<T> >
+struct Weld
+{
+	// Welds p in place; xrefs maps old elements to new elements. Returns the new size of p.
+	uint operator()(Array<T> & p, Array<uint> & xrefs)
+	{
+		const uint N = p.size();							// # of input vertices.
+		uint outputCount = 0;								// # of output vertices (unique elements kept so far)
+		uint hashSize = nextPowerOfTwo(N);					// size of the hash table (presumably a power of two, as the mask below needs)
+		uint * hashTable = new uint[hashSize + N];			// hashSize bucket heads followed by N chain links
+		uint * next = hashTable + hashSize;					// next[k]: following output index in the same bucket
+
+		xrefs.resize(N);
+		memset( hashTable, NIL, hashSize*sizeof(uint) );	// mark every bucket empty (NIL = 0xFFFFFFFF so memset works)
+
+		H hash;
+		E equal;
+		for (uint i = 0; i < N; i++)
+		{
+			const T & e = p[i];
+			uint32 hashValue = hash(e) & (hashSize-1);		// mask the hash down to a bucket index
+			uint offset = hashTable[hashValue];
+
+			// walk the bucket's chain of already kept elements until an equal one is found
+			while( offset != NIL && !equal(p[offset], e) )
+			{
+				offset = next[offset];
+			}
+
+			xrefs[i] = offset;	// on a match this is the output index of the earlier equal element
+
+			// no match found - keep this element & add it to the hash
+			if( offset == NIL )
+			{
+				// the element's new index is the next free output slot
+				xrefs[i] = outputCount;
+
+				// move the element down to its output slot (outputCount <= i)
+				p[outputCount] = e;
+
+				// chain the previous head of the bucket behind the new element
+				next[outputCount] = hashTable[hashValue];
+
+				// the new element becomes the bucket head; advance the output counter
+				hashTable[hashValue] = outputCount++;
+			}
+		}
+
+		// release the table and the chain links (one allocation)
+		delete [] hashTable;
+
+		p.resize(outputCount);
+		
+		// number of output vertices
+		return outputCount;
+	}
+};
+
+
+/// Permute the given array: after the call, element i is the element that was
+/// previously stored at index xrefs[i]. The array ends up with xrefs.count() entries.
+template <class T>
+void reorderArray(Array<T> & array, const Array<uint> & xrefs)
+{
+	const uint total = xrefs.count();
+	Array<T> permuted;
+	permuted.resize(total);
+
+	// Gather: slot 'dst' of the result takes the element its xref points at.
+	for (uint dst = 0; dst != total; ++dst) permuted[dst] = array[xrefs[dst]];
+
+	swap(array, permuted);
+}
+
+/// Invert an old->new index map into a new->old map holding 'count' entries.
+/// When several old indices share a new index, the highest old index is kept.
+inline void reverseXRefs(Array<uint> & xrefs, uint count)
+{
+	Array<uint> inverse;
+	inverse.resize(count);
+
+	// The bound is re-read from xrefs on every pass, like a plain index loop.
+	for (uint oldIndex = 0; oldIndex < xrefs.count(); ++oldIndex) inverse[xrefs[oldIndex]] = oldIndex;
+
+	swap(xrefs, inverse);
+}
+
+
+
+/// Welds a raw buffer of N fixed-size records, comparing them byte by byte (memcmp).
+struct WeldN
+{
+	uint vertexSize;	// size in bytes of one record
+
+	WeldN(uint n) : vertexSize(n) {}
+
+	// Compacts ptr in place; xrefs maps old elements to new elements. Returns the output count.
+	uint operator()(uint8 * ptr, uint N, Array<uint> & xrefs)
+	{
+		uint outputCount = 0;								// # of output vertices
+		uint hashSize = nextPowerOfTwo(N);					// size of the hash table
+		uint * hashTable = new uint[hashSize + N];			// hash table + linked list
+		uint * next = hashTable + hashSize;					// use bottom part as linked list
+
+		xrefs.resize(N);
+		memset( hashTable, NIL, hashSize*sizeof(uint) );	// init hash table (NIL = 0xFFFFFFFF so memset works)
+
+		for (uint i = 0; i < N; i++)
+		{
+			const uint8 * vertex = ptr + i * vertexSize;
+			uint32 hashValue = sdbmHash(vertex, vertexSize) & (hashSize-1);
+			uint offset = hashTable[hashValue];
+
+			// traverse linked list
+			while (offset != NIL && memcmp(ptr + offset * vertexSize, vertex, vertexSize) != 0)
+			{
+				offset = next[offset];
+			}
+
+			xrefs[i] = offset;
+
+			// no match found - copy vertex & add to hash
+			if (offset == NIL)
+			{
+				// save xref
+				xrefs[i] = outputCount;
+				// copy element down to its output slot. Skip the copy when it is already
+				// in place: memcpy with identical source and destination is undefined.
+				if (outputCount != i) memcpy(ptr + outputCount * vertexSize, vertex, vertexSize);
+
+				// link to hash table
+				next[outputCount] = hashTable[hashValue];
+
+				// update hash heads and increase output counter
+				hashTable[hashValue] = outputCount++;
+			}
+		}
+
+		// cleanup
+		delete [] hashTable;
+
+		// number of output vertices
+		return outputCount;
+	}
+};
+
+
+} // nv namespace
+
+#endif // NV_MESH_WELD_H
diff --git a/thirdparty/thekla_atlas/poshlib/posh.c b/thirdparty/thekla_atlas/poshlib/posh.c
new file mode 100644
index 0000000000..bd3fcc66ea
--- /dev/null
+++ b/thirdparty/thekla_atlas/poshlib/posh.c
@@ -0,0 +1,1006 @@
+/*
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package'ss contributors contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/** 
+ @file    posh.c
+ @author  Brian Hook
+ @date    2002
+ @brief   Portable Open Source Harness primary source file
+*/
+#include "posh.h"
+
+#if !defined FORCE_DOXYGEN
+
+#if !defined POSH_NO_FLOAT /* the *_STRING labels feed the report built by POSH_GetArchString() */
+#  define POSH_FLOAT_STRING "enabled"
+#else
+#  define POSH_FLOAT_STRING "disabled"
+#endif /* POSH_NO_FLOAT */
+
+#if defined POSH_64BIT_INTEGER
+#  define POSH_64BIT_INTEGER_STRING "yes"
+#else
+#  define POSH_64BIT_INTEGER_STRING "no"
+#endif /* POSH_64BIT_INTEGER */
+
+#if defined POSH_64BIT_POINTER
+#  define POSH_POINTER_STRING "64-bits"
+#else /* anything else is reported as 32-bit */
+#  define POSH_POINTER_STRING "32-bits"
+#endif /* POSH_64BIT_POINTER */
+
+#if defined POSH_LITTLE_ENDIAN /* NATIVEnn / FOREIGNnn: host-order and opposite-order conversion macros */
+#  define IS_BIG_ENDIAN    0
+
+#  define NATIVE16  POSH_LittleU16
+#  define NATIVE32  POSH_LittleU32
+#  define NATIVE64  POSH_LittleU64
+#  define FOREIGN16 POSH_BigU16
+#  define FOREIGN32 POSH_BigU32
+#  define FOREIGN64 POSH_BigU64
+#else /* not little-endian: treated as big-endian */
+#  define IS_BIG_ENDIAN    1
+
+#  define NATIVE16  POSH_BigU16
+#  define NATIVE32  POSH_BigU32
+#  define NATIVE64  POSH_BigU64
+#  define FOREIGN16 POSH_LittleU16
+#  define FOREIGN32 POSH_LittleU32
+#  define FOREIGN64 POSH_LittleU64
+#endif /* POSH_LITTLE_ENDIAN */
+
+static
+int
+s_testBigEndian( void )
+{
+   /* Overlay a 32-bit integer with its individual bytes. */
+   union
+   {
+      posh_u32_t  word;
+      posh_byte_t bytes[ 4 ];
+   } probe;
+
+   probe.word = 1;
+
+   /* Returns 0 when the first byte in memory holds the 1 (the low-order byte
+      is stored first, i.e. little-endian) and 1 in every other case, which
+      is reported as big-endian. */
+   return ( probe.bytes[ 0 ] == 1 ) ? 0 : 1;
+}
+
+static
+const char *
+s_testSerialization( void )
+{
+   /* Scratch buffer shared by every round trip below; eight bytes covers the
+      widest (64-bit) case. */
+   posh_byte_t scratch[ 8 ];
+
+   /* 16-bit: write in a given byte order, read back, expect the same value */
+   POSH_WriteU16ToLittle( scratch, 0xABCD );
+   if ( POSH_ReadU16FromLittle( scratch ) != 0xABCD )
+   {
+      return "*ERROR: failed little-endian 16-bit serialization test";
+   }
+
+   /* same value through the big-endian pair */
+   POSH_WriteU16ToBig( scratch, 0xABCD );
+   if ( POSH_ReadU16FromBig( scratch ) != 0xABCD )
+   {
+      return "*ERROR: failed big-endian 16-bit serialization test";
+   }
+
+   /* 32-bit: same round trips with a four-byte pattern */
+   POSH_WriteU32ToLittle( scratch, 0xABCD1234L );
+   if ( POSH_ReadU32FromLittle( scratch ) != 0xABCD1234 )
+   {
+      return "*ERROR: failed little-endian 32-bit serialization test";
+   }
+
+   /* same pattern through the big-endian pair */
+   POSH_WriteU32ToBig( scratch, 0xABCD1234L );
+   if ( POSH_ReadU32FromBig( scratch ) != 0xABCD1234 )
+   {
+      return "*ERROR: failed big-endian 32-bit serialization test";
+   }
+
+#if defined POSH_64BIT_INTEGER
+   {
+      /* 64-bit: only compiled when POSH_64BIT_INTEGER is defined */
+#define REF64 POSH_U64(0xFEDCBA9876543210)
+
+      POSH_WriteU64ToLittle( scratch, REF64 );
+      if ( POSH_ReadU64FromLittle( scratch ) != REF64 )
+      {
+         return "*ERROR: failed little-endian 64-bit serialization test";
+      }
+
+      POSH_WriteU64ToBig( scratch, REF64 );
+      if ( POSH_ReadU64FromBig( scratch ) != REF64 )
+      {
+         return "*ERROR: failed big-endian 64-bit serialization test";
+      }
+   }
+#endif
+
+   /* every round trip reproduced its input */
+   return 0;
+}
+
+#if !defined POSH_NO_FLOAT
+static
+const char *
+s_testFloatingPoint( void )
+{
+   /* Reference values (one third) are converted to raw bits and back again. */
+   const float  refSingle = 10.0f/30.0f;
+   const double refDouble = 10.0/30.0;
+   posh_byte_t  rawDouble[ 8 ];
+   posh_u32_t   rawSingle;
+   float        gotSingle;
+   double       gotDouble;
+
+   rawSingle = POSH_LittleFloatBits( refSingle );
+   gotSingle = POSH_FloatFromLittleBits( rawSingle );
+   if ( gotSingle != refSingle )
+   {
+      return "*ERROR: POSH little endian floating point conversion failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   rawSingle = POSH_BigFloatBits( refSingle );
+   gotSingle = POSH_FloatFromBigBits( rawSingle );
+   if ( gotSingle != refSingle )
+   {
+      return "*ERROR: POSH big endian floating point conversion failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   POSH_DoubleBits( refDouble, rawDouble );
+   gotDouble = POSH_DoubleFromBits( rawDouble );
+   if ( gotDouble != refDouble )
+   {
+      return "*ERROR: POSH double precision floating point serialization failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+   return 0;
+}
+#endif /* !defined POSH_NO_FLOAT */
+
+static
+const char *
+s_testEndianess( void )
+{
+   /* The byte order selected at compile time must agree with the run-time probe. */
+   if ( IS_BIG_ENDIAN != s_testBigEndian() )
+   {
+      return "*ERROR: POSH compile time endianess does not match run-time endianess verification.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* NATIVEnn must leave a value untouched and FOREIGNnn must byte swap it;
+      the checks run in the order listed and stop at the first mismatch. */
+   if ( ! ( ( NATIVE32( 0x11223344L ) == 0x11223344L ) &&
+            ( FOREIGN32( 0x11223344L ) == 0x44332211L ) &&
+            ( NATIVE16( 0x1234 ) == 0x1234 ) &&
+            ( FOREIGN16( 0x1234 ) == 0x3412 ) ) )
+   {
+      return "*ERROR: POSH endianess macro selection failed.  Please report this to poshlib@poshlib.org!\n";
+   }
+
+   /* Serialization is exercised separately by s_testSerialization(). */
+   return 0;
+}
+#endif /* !defined FORCE_DOXYGEN */
+
+/**
+  Returns a string describing this platform's basic attributes.  
+
+  POSH_GetArchString() reports on an architecture's statically determined
+  attributes.  In addition, it will perform run-time verification checks
+  to make sure the various platform specific functions work.  If an error
+  occurs, please contact me at poshlib@poshlib.org so we can try to resolve
+  what the specific failure case is.
+  @returns a string describing this platform on success, or a string in the 
+           form "*ERROR: [text]" on failure.  You can simply check to see if
+           the first character returned is '*' to verify an error condition.
+*/
+const char *
+POSH_GetArchString( void )
+{
+   const char *err;
+   /* Keep whitespace between each literal and macro: "..."MACRO is parsed as a user-defined literal suffix by C++11 and later compilers. */
+   const char *s = "OS:.............." POSH_OS_STRING "\n"
+                   "CPU:............." POSH_CPU_STRING "\n"
+                   "endian:.........." POSH_ENDIAN_STRING "\n"
+                   "ptr size:........" POSH_POINTER_STRING "\n"
+                   "64-bit ints......" POSH_64BIT_INTEGER_STRING "\n"
+                   "floating point..." POSH_FLOAT_STRING "\n"
+                   "compiler........." POSH_COMPILER_STRING "\n";
+
+   /* test endianess; the first failing check's "*ERROR: ..." text is returned as is */
+   err = s_testEndianess();
+   if ( err != 0 )
+   {
+      return err;
+   }
+
+   /* test serialization */
+   err = s_testSerialization();
+
+   if ( err != 0 )
+   {
+      return err;
+   }
+
+#if !defined POSH_NO_FLOAT
+   /* check that our floating point support is correct */
+   err = s_testFloatingPoint();
+
+   if ( err != 0 )
+   {
+      return err;
+   }
+
+#endif
+
+   return s;
+}
+
+/* ---------------------------------------------------------------------------*/
+/*                           BYTE SWAPPING SUPPORT                            */
+/* ---------------------------------------------------------------------------*/
+/** 
+ * Byte swaps a 16-bit unsigned value
+ *
+   @ingroup ByteSwapFunctions
+   @param v [in] unsigned 16-bit input value to swap
+   @returns a byte swapped version of v
+ */
+posh_u16_t
+POSH_SwapU16( posh_u16_t v )
+{
+   /* The low byte moves up and the high byte moves down; the casts discard
+      any bits shifted above the posh_u16_t range. */
+   const posh_u16_t lowToHigh = ( posh_u16_t ) ( v << 8 );
+   const posh_u16_t highToLow = ( posh_u16_t ) ( v >> 8 );
+
+   return ( posh_u16_t ) ( lowToHigh | highToLow );
+}
+
+/** 
+ * Byte swaps a 16-bit signed value
+ *
+   @ingroup ByteSwapFunctions
+   @param v [in] signed 16-bit input value to swap
+   @returns a byte swapped version of v
+   @remarks This just calls back to the unsigned version, since byte swapping 
+            is independent of sign.  However, we still provide this function to
+            avoid signed/unsigned mismatch compiler warnings.
+ */
+posh_i16_t POSH_SwapI16( posh_i16_t v )
+{
+   const posh_u16_t swapped = POSH_SwapU16( ( posh_u16_t ) v ); /* swapping ignores the sign */
+   return ( posh_i16_t ) swapped;
+}
+
+/** 
+ * Byte swaps a 32-bit unsigned value
+ *
+   @ingroup ByteSwapFunctions
+   @param v [in] unsigned 32-bit input value to swap
+   @returns a byte swapped version of v
+ */
+posh_u32_t
+POSH_SwapU32( posh_u32_t v )
+{
+   /* Each byte moves to its mirrored position: byte 0 <-> byte 3 and
+      byte 1 <-> byte 2 (byte 0 being the least significant one). */
+   const posh_u32_t byte0to3 = ( v & 0xFF ) << 24;
+   const posh_u32_t byte1to2 = ( v & 0xFF00 ) << 8;
+   const posh_u32_t byte2to1 = ( v >> 8 ) & 0xFF00;
+   const posh_u32_t byte3to0 = ( v >> 24 );
+
+   return byte0to3 | byte1to2 | byte2to1 | byte3to0;
+}
+
+/** 
+ * Byte swaps a 32-bit signed value
+ *
+   @ingroup ByteSwapFunctions
+   @param v [in] signed 32-bit input value to swap
+   @returns a byte swapped version of v
+   @remarks This just calls back to the unsigned version, since byte swapping 
+            is independent of sign.  However, we still provide this function to
+            avoid signed/unsigned mismatch compiler warnings.
+ */
+posh_i32_t POSH_SwapI32( posh_i32_t v )
+{
+   const posh_u32_t swapped = POSH_SwapU32( ( posh_u32_t ) v ); /* swapping ignores the sign */
+   return ( posh_i32_t ) swapped;
+}
+
+#if defined POSH_64BIT_INTEGER
+/**
+ * Byte swaps a 64-bit unsigned value
+
+   @param v [in] a 64-bit input value to swap
+   @ingroup SixtyFourBit
+   @returns a byte swapped version of v
+*/
+posh_u64_t
+POSH_SwapU64( posh_u64_t v )
+{
+   union {
+      posh_u64_t  whole;
+      posh_byte_t part[ 8 ];
+   } pun;
+   int lo;
+   pun.whole = v;
+   /* Exchange mirrored bytes in memory: 0<->7, 1<->6, 2<->5, 3<->4. */
+   for ( lo = 0; lo < 4; lo++ )
+   {
+      const posh_byte_t held = pun.part[ lo ];
+      pun.part[ lo ]     = pun.part[ 7 - lo ];
+      pun.part[ 7 - lo ] = held;
+   }
+   return pun.whole;
+}
+
+/**
+ * Byte swaps a 64-bit signed value
+
+   @param v [in] a 64-bit input value to swap
+   @ingroup SixtyFourBit
+   @returns a byte swapped version of v
+*/
+posh_i64_t POSH_SwapI64( posh_i64_t v )
+{
+   const posh_u64_t swapped = POSH_SwapU64( ( posh_u64_t ) v ); /* swapping ignores the sign */
+   return ( posh_i64_t ) swapped;
+}
+
+#endif /* defined POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                           IN-MEMORY SERIALIZATION                          */
+/* ---------------------------------------------------------------------------*/
+
+/**
+ * Writes an unsigned 16-bit value to a little endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL.  Alignment doesn't matter.
+ @param value [in] host-endian unsigned 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs
+*/
+posh_u16_t *
+POSH_WriteU16ToLittle( void *dst, posh_u16_t value )
+{
+   posh_byte_t *out = ( posh_byte_t * ) dst;
+
+   /* Little-endian layout: the least significant byte is stored first. */
+   out[ 0 ] = value & 0xFF;
+   out[ 1 ] = ( value >> 8 ) & 0xFF;
+
+   return ( ( posh_u16_t * ) dst ) + 1;
+}
+
+/**
+ * Writes a signed 16-bit value to a little endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs.  This simply calls
+          POSH_WriteU16ToLittle() with appropriate casting.
+*/
+posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value )
+{
+   posh_u16_t *end = POSH_WriteU16ToLittle( dst, ( posh_u16_t ) value ); /* unsigned writer does the work */
+   return ( posh_i16_t * ) end;
+}
+
+/**
+ * Writes an unsigned 32-bit value to a little endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_u32_t *
+POSH_WriteU32ToLittle( void *dst, posh_u32_t value )
+{
+   posh_byte_t *out = ( posh_byte_t * ) dst;
+   int          i;
+
+   /* Emit the four bytes from least to most significant. */
+   for ( i = 0; i < 4; i++ )
+   {
+      out[ i ] = ( posh_byte_t ) ( ( value >> ( 8 * i ) ) & 0xFF );
+   }
+   return ( ( posh_u32_t * ) dst ) + 1;
+}
+
+/**
+ * Writes a signed 32-bit value to a little endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.  This simply calls
+          POSH_WriteU32ToLittle() with appropriate casting.
+*/
+posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value )
+{
+   posh_u32_t *end = POSH_WriteU32ToLittle( dst, ( posh_u32_t ) value ); /* unsigned writer does the work */
+   return ( posh_i32_t * ) end;
+}
+
+/**
+ * Writes an unsigned 16-bit value to a big endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs
+*/
+posh_u16_t *
+POSH_WriteU16ToBig( void *dst, posh_u16_t value )
+{
+   posh_byte_t *out = ( posh_byte_t * ) dst;
+
+   /* Big-endian layout: the most significant byte is stored first. */
+   out[ 0 ] = ( value >> 8 ) & 0xFF;
+   out[ 1 ] = value & 0xFF;
+
+   return ( ( posh_u16_t * ) dst ) + 1;
+}
+
+/**
+ * Writes a signed 16-bit value to a big endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 16-bit value
+ @returns a pointer to the location two bytes after dst
+ @remarks does no validation of the inputs.  This simply calls
+          POSH_WriteU16ToBig() with appropriate casting.
+*/
+posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value )
+{
+   posh_u16_t *end = POSH_WriteU16ToBig( dst, ( posh_u16_t ) value ); /* unsigned writer does the work */
+   return ( posh_i16_t * ) end;
+}
+
+/**
+ * Writes an unsigned 32-bit value to a big endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_u32_t *
+POSH_WriteU32ToBig( void *dst, posh_u32_t value )
+{
+   posh_byte_t *out = ( posh_byte_t * ) dst;
+   int          i;
+
+   /* Byte i of the value (0 = least significant) lands at offset 3 - i. */
+   for ( i = 0; i < 4; i++ )
+   {
+      out[ 3 - i ] = ( posh_byte_t ) ( ( value >> ( 8 * i ) ) & 0xFF );
+   }
+   return ( ( posh_u32_t * ) dst ) + 1;
+}
+
+/**
+ * Writes a signed 32-bit value to a big endian buffer
+
+ @ingroup MemoryBuffer
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 32-bit value
+ @returns a pointer to the location four bytes after dst
+ @remarks does no validation of the inputs.  This simply calls
+          POSH_WriteU32ToBig() with appropriate casting.
+*/
+posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value )
+{
+   posh_u32_t *end = POSH_WriteU32ToBig( dst, ( posh_u32_t ) value ); /* unsigned writer does the work */
+   return ( posh_i32_t * ) end;
+}
+
+#if defined POSH_64BIT_INTEGER
+/**
+ * Writes an unsigned 64-bit value to a little-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_u64_t *
+POSH_WriteU64ToLittle( void *dst, posh_u64_t value )
+{
+   posh_byte_t *out = ( posh_byte_t * ) dst;
+   int          shift;
+
+   /* One byte per step, least significant first. */
+   for ( shift = 0; shift < 64; shift += 8 )
+   {
+      *out++ = ( posh_byte_t ) ( ( value >> shift ) & 0xFF );
+   }
+
+   return ( ( posh_u64_t * ) dst ) + 1;
+}
+
+/**
+ * Writes a signed 64-bit value to a little-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value )
+{
+   posh_u64_t *end = POSH_WriteU64ToLittle( dst, ( posh_u64_t ) value ); /* unsigned writer does the work */
+   return ( posh_i64_t * ) end;
+}
+
+/**
+ * Writes an unsigned 64-bit value to a big-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian unsigned 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_u64_t *
+POSH_WriteU64ToBig( void *dst, posh_u64_t value )
+{
+   posh_u64_t *p64 = ( posh_u64_t * ) dst;
+   posh_byte_t *p  = ( posh_byte_t * ) dst;
+   int i;
+
+   for ( i = 0; i < 8; i++, value >>= 8 )
+   {
+       p[ 7-i ] = ( posh_byte_t ) ( value & 0xFF ); /* least significant byte goes last */
+   }
+   /* advance by ONE 64-bit element, i.e. eight bytes (the code used to add 8 elements = 64 bytes) */
+   return p64 + 1;
+}
+
+/**
+ * Writes a signed 64-bit value to a big-endian buffer
+
+ @ingroup SixtyFourBit
+ @param dst [out] pointer to the destination buffer, may not be NULL
+ @param value [in] host-endian signed 64-bit value
+ @returns a pointer to the location eight bytes after dst
+ @remarks does no validation of the inputs.
+*/
+posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value )
+{
+   posh_u64_t *end = POSH_WriteU64ToBig( dst, ( posh_u64_t ) value ); /* unsigned writer does the work */
+   return ( posh_i64_t * ) end;
+}
+
+#endif /* POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                         IN-MEMORY DESERIALIZATION                          */
+/* ---------------------------------------------------------------------------*/
+
+/** 
+ * Reads an unsigned 16-bit value from a little-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 16-bit value
+*/
+posh_u16_t
+POSH_ReadU16FromLittle( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+
+    /* in[ 0 ] is the least significant byte, in[ 1 ] the most significant. */
+    const posh_u16_t low  = in[ 0 ];
+    const posh_u16_t high = ( posh_u16_t ) ( ( ( posh_u16_t ) in[ 1 ] ) << 8 );
+
+    return ( posh_u16_t ) ( low | high );
+}
+
+/** 
+ * Reads a signed 16-bit value from a little-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian signed 16-bit value
+*/
+posh_i16_t POSH_ReadI16FromLittle( const void *src )
+{
+   const posh_u16_t raw = POSH_ReadU16FromLittle( src ); /* same bytes, unsigned view */
+   return ( posh_i16_t ) raw;
+}
+
+/** 
+ * Reads an unsigned 32-bit value from a little-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 32-bit value
+*/
+posh_u32_t
+POSH_ReadU32FromLittle( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+    posh_u32_t         value = 0;
+    int                i;
+    /* Byte i of the buffer supplies bits 8*i .. 8*i+7 of the result. */
+    for ( i = 0; i < 4; i++ )
+    {
+        value |= ( ( posh_u32_t ) in[ i ] ) << ( i * 8 );
+    }
+    return value;
+}
+
+/** 
+ * Reads a signed 32-bit value from a little-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian signed 32-bit value
+*/
+posh_i32_t POSH_ReadI32FromLittle( const void *src )
+{
+   const posh_u32_t raw = POSH_ReadU32FromLittle( src ); /* same bytes, unsigned view */
+   return ( posh_i32_t ) raw;
+}
+
+
+/** 
+ * Reads an unsigned 16-bit value from a big-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 16-bit value
+*/
+posh_u16_t
+POSH_ReadU16FromBig( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+
+    /* in[ 0 ] is the most significant byte, in[ 1 ] the least significant. */
+    const posh_u16_t high = ( posh_u16_t ) ( ( ( posh_u16_t ) in[ 0 ] ) << 8 );
+    const posh_u16_t low  = in[ 1 ];
+
+    return ( posh_u16_t ) ( low | high );
+}
+
+/** 
+ * Reads a signed 16-bit value from a big-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian signed 16-bit value
+*/
+posh_i16_t POSH_ReadI16FromBig( const void *src )
+{
+   const posh_u16_t raw = POSH_ReadU16FromBig( src ); /* same bytes, unsigned view */
+   return ( posh_i16_t ) raw;
+}
+
+/** 
+ * Reads an unsigned 32-bit value from a big-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 32-bit value
+*/
+posh_u32_t
+POSH_ReadU32FromBig( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+    posh_u32_t         value = 0;
+    int                i;
+    /* The last byte of the buffer is the least significant one. */
+    for ( i = 0; i < 4; i++ )
+    {
+        value |= ( ( posh_u32_t ) in[ 3 - i ] ) << ( i * 8 );
+    }
+    return value;
+}
+
+/**
+ * Reads a signed 32-bit value from a big-endian buffer
+ @ingroup MemoryBuffer
+ @param src [in] source buffer; alignment doesn't matter (it is read byte by byte)
+ @returns host-endian signed 32-bit value
+*/
+posh_i32_t
+POSH_ReadI32FromBig( const void *src )
+{
+   return ( posh_i32_t ) POSH_ReadU32FromBig( src );
+}
+
+#if defined POSH_64BIT_INTEGER
+
+/** 
+ * Reads an unsigned 64-bit value from a little-endian buffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 64-bit value
+*/
+posh_u64_t
+POSH_ReadU64FromLittle( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+    posh_u64_t         value = 0;
+    int                i;
+
+    /* Start from the most significant (last) byte and shift it up as the rest arrive. */
+    for ( i = 7; i >= 0; i-- )
+    {
+        value = ( value << 8 ) | in[ i ];
+    }
+    return value;
+}
+
+/** 
+ * Reads a signed 64-bit value from a little-endian buffer
+ @param src [in] source buffer
+ @returns host-endian signed 64-bit value
+*/
+posh_i64_t POSH_ReadI64FromLittle( const void *src )
+{
+   const posh_u64_t raw = POSH_ReadU64FromLittle( src ); /* same bytes, unsigned view */
+   return ( posh_i64_t ) raw;
+}
+
+/** 
+ * Reads an unsigned 64-bit value from a big-endian buffer
+ @param src [in] source buffer
+ @returns host-endian unsigned 64-bit value
+*/
+posh_u64_t
+POSH_ReadU64FromBig( const void *src )
+{
+    const posh_byte_t *in = ( const posh_byte_t * ) src;
+    posh_u64_t         value = 0;
+    int                i;
+
+    /* The first byte is the most significant: shift it up as the rest arrive. */
+    for ( i = 0; i < 8; i++ )
+    {
+        value = ( value << 8 ) | in[ i ];
+    }
+    return value;
+}
+
+/** 
+ * Reads a signed 64-bit value from a big-endian buffer
+ @param src [in] source buffer
+ @returns host-endian signed 64-bit value
+*/
+posh_i64_t POSH_ReadI64FromBig( const void *src )
+{
+   const posh_u64_t raw = POSH_ReadU64FromBig( src ); /* same bytes, unsigned view */
+   return ( posh_i64_t ) raw;
+}
+
+#endif /* POSH_64BIT_INTEGER */
+
+/* ---------------------------------------------------------------------------*/
+/*                           FLOATING POINT SUPPORT                           */
+/* ---------------------------------------------------------------------------*/
+
+#if !defined POSH_NO_FLOAT
+
+/** @ingroup FloatingPoint
+    @param[in] f floating point value
+    @returns a little-endian bit representation of f
+ */
+posh_u32_t
+POSH_LittleFloatBits( float f )
+{
+   /* Read the float's storage back as an integer, without a numeric conversion. */
+   union
+   {
+      posh_u32_t raw;
+      float      value;
+   } pun;
+
+   pun.value = f;
+   return POSH_LittleU32( pun.raw );
+}
+
+/** 
+ * Extracts raw big-endian bits from a 32-bit floating point value
+ *
+   @ingroup FloatingPoint
+   @param   f [in] floating point value
+   @returns a big-endian bit representation of f
+ */
+posh_u32_t
+POSH_BigFloatBits( float f )
+{
+   /* Read the float's storage back as an integer, without a numeric conversion. */
+   union
+   {
+      posh_u32_t raw;
+      float      value;
+   } pun;
+
+   pun.value = f;
+   return POSH_BigU32( pun.raw );
+}
+
+/** 
+ * Extracts raw, little-endian bit representation from a 64-bit double.
+ *
+   @param d [in] 64-bit double precision value
+   @param dst [out] 8-byte storage buffer
+   @ingroup FloatingPoint
+   @remarks the raw bits used to represent the value 'd' are written to dst, in the form dst[0]=LSB
+ */
+void
+POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] )
+{
+   /* Gives byte-wise access to the storage of a double. */
+   union
+   {
+      posh_byte_t raw[ 8 ];
+      double      value;
+   } pun;
+   int i;
+
+   pun.value = d;
+
+   /*
+    * Copy the eight in-memory bytes of the double into dst:
+    *   - when POSH_LITTLE_ENDIAN is defined, in the order they are stored;
+    *   - otherwise in reverse order.
+    * Every element of dst is written exactly once.
+    */
+   for ( i = 0; i < 8; i++ )
+   {
+
+#if defined POSH_LITTLE_ENDIAN
+      dst[ i ] = pun.raw[ i ];
+#else
+      dst[ i ] = pun.raw[ 7 - i ];
+#endif
+
+   }
+
+}
+
+/** 
+ * Creates a double-precision, 64-bit floating point value from a set of raw, 
+ * little-endian bits
+
+   @ingroup FloatingPoint
+   @param src [in] little-endian byte representation of 64-bit double precision 
+                  floating point value
+   @returns double precision floating point representation of the raw bits
+   @remarks No error checking is performed, so there are no guarantees that the 
+            result is a valid number, nor is there any check to ensure that src is 
+            non-NULL.  BE CAREFUL USING THIS.
+ */
+double
+POSH_DoubleFromBits( const posh_byte_t src[ 8 ] )
+{
+   /* Gives byte-wise access to the storage of a double. */
+   union
+   {
+      posh_byte_t raw[ 8 ];
+      double      value;
+   } pun;
+   int i;
+
+   /*
+    * Fill the eight in-memory bytes of the double from src:
+    *   - when POSH_LITTLE_ENDIAN is defined, in the order they were given;
+    *   - otherwise in reverse order.
+    * Every byte of the storage is written exactly once.
+    */
+   for ( i = 0; i < 8; i++ )
+   {
+
+#if defined POSH_LITTLE_ENDIAN
+      pun.raw[ i ] = src[ i ];
+#else
+      pun.raw[ 7 - i ] = src[ i ];
+#endif
+
+   }
+
+   /* Read the assembled storage back as a double. */
+   return pun.value;
+}
+
+/** 
+ * Creates a floating point number from little endian bits
+ *
+   @ingroup FloatingPoint
+   @param   bits [in] raw floating point bits in little-endian form
+   @returns a floating point number based on the given bit representation
+   @remarks No error checking is performed, so there are no guarantees that the 
+            result is a valid number.  BE CAREFUL USING THIS.
+ */
+float
+POSH_FloatFromLittleBits( posh_u32_t bits )
+{
+   union
+   {
+      posh_u32_t raw;
+      float      value;
+   } pun;
+   /* Byte swap first when POSH_BIG_ENDIAN is defined, then read the storage as a float. */
+#if defined POSH_BIG_ENDIAN
+   pun.raw = POSH_SwapU32( bits );
+#else
+   pun.raw = bits;
+#endif
+   return pun.value;
+}
+
+/** 
+ * Creates a floating point number from big-endian bits
+ *
+   @ingroup FloatingPoint
+   @param   bits [in] raw floating point bits in big-endian form
+   @returns a floating point number based on the given bit representation
+   @remarks No error checking is performed, so there are no guarantees that the 
+            result is a valid number.  BE CAREFUL USING THIS.
+ */
+float
+POSH_FloatFromBigBits( posh_u32_t bits )
+{
+   union
+   {
+      posh_u32_t raw;
+      float      value;
+   } pun;
+   /* Byte swap first when POSH_LITTLE_ENDIAN is defined, then read the storage as a float. */
+#if defined POSH_LITTLE_ENDIAN
+   pun.raw = POSH_SwapU32( bits );
+#else
+   pun.raw = bits;
+#endif
+   return pun.value;
+}
+
+#endif /* !defined POSH_NO_FLOAT */
diff --git a/thirdparty/thekla_atlas/poshlib/posh.h b/thirdparty/thekla_atlas/poshlib/posh.h
new file mode 100644
index 0000000000..c3efe26a2d
--- /dev/null
+++ b/thirdparty/thekla_atlas/poshlib/posh.h
@@ -0,0 +1,1030 @@
+/**
+@file posh.h
+@author Brian Hook
+@version 1.3.001
+
+Header file for POSH, the Portable Open Source Harness project.
+
+NOTE: Unlike most header files, this one is designed to be included
+multiple times, which is why it does not have the @#ifndef/@#define
+preamble.
+
+POSH relies on environment specified preprocessor symbols in order
+to infer as much as possible about the target OS/architecture and
+the host compiler capabilities.
+
+NOTE: POSH is simple and focused. It attempts to provide basic
+functionality and information, but it does NOT attempt to emulate
+missing functionality.  I am also not willing to make POSH dirty
+and hackish to support truly ancient and/or outmoded and/or bizarre
+technologies such as non-ANSI compilers, systems with non-IEEE
+floating point formats, segmented 16-bit operating systems, etc.
+
+Please refer to the accompanying HTML documentation or visit
+http://www.poshlib.org for more information on how to use POSH.
+
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package'ss contributors contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REVISION:
+
+I've been lax about revision histories, so this starts at, um, 1.3.001.
+Sorry for any inconveniences.
+
+1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary,
+                      where I was not detecting Visual Studio
+                      compilation on x86-64 systems.  Added check for
+                      _M_X64 which should fix that.
+
+*/
+/*
+I have yet to find an authoritative reference on preprocessor
+symbols, but so far this is what I've gleaned:
+
+GNU GCC/G++:
+   - __GNUC__: GNU C version
+   - __GNUG__: GNU C++ compiler
+   - __sun__ : on Sun platforms
+   - __svr4__: on Solaris and other SysV R4 platforms
+   - __mips__: on MIPS processor platforms
+   - __sparc_v9__: on Sparc 64-bit CPUs
+   - __sparcv9: 64-bit Solaris
+   - __MIPSEL__: mips processor, compiled for little endian
+   - __MIPSEB__: mips processor, compiled for big endian
+   - _R5900: MIPS/Sony/Toshiba R5900 (PS2)
+   - mc68000: 68K
+   - m68000: 68K
+   - m68k: 68K
+   - __palmos__: PalmOS
+
+Intel C/C++ Compiler:
+   - __ECC      : compiler version, IA64 only
+   - __EDG__
+   - __ELF__
+   - __GXX_ABI_VERSION
+   - __i386     : IA-32 only
+   - __i386__   : IA-32 only
+   - i386       : IA-32 only
+   - __ia64     : IA-64 only
+   - __ia64__   : IA-64 only
+   - ia64       : IA-64 only
+   - __ICC      : IA-32 only
+   - __INTEL_COMPILER : IA-32 or IA-64, newer versions only
+
+Apple's C/C++ Compiler for OS X:
+   - __APPLE_CC__
+   - __APPLE__
+   - __BIG_ENDIAN__
+   - __APPLE__
+   - __ppc__
+   - __MACH__
+
+DJGPP:
+   - __MSDOS__
+   - __unix__
+   - __unix
+   - __GNUC__
+   - __GO32
+   - DJGPP
+   - __i386, __i386__, i386
+
+Cray's C compiler:
+   - _ADDR64: if 64-bit pointers
+   - _UNICOS: 
+   - __unix:
+
+SGI's CC compiler predefines the following (and more) with -ansi:
+   - __sgi
+   - __unix
+   - __host_mips
+   - _SYSTYPE_SVR4
+   - __mips
+   - _MIPSEB
+   - anyone know if there is a predefined symbol for the compiler?!
+
+MinGW:
+   - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others
+   - __MINGW32__
+
+Cygwin:
+   - as Gnu C, but also
+   - __unix__
+   - __CYGWIN32__
+
+Microsoft Visual Studio predefines the following:
+   - _MSC_VER
+   - _WIN32: on Win32
+   - _M_IX86 (on x86 systems)
+   - _M_X64: on x86-64 systems
+   - _M_ALPHA (on DEC AXP systems)
+   - _SH3: WinCE, Hitachi SH-3
+   - _MIPS: WinCE, MIPS
+   - _ARM: WinCE, ARM
+
+Sun's C Compiler:
+   - sun and _sun
+   - unix and _unix
+   - sparc and _sparc (SPARC systems only)
+   - i386 and _i386 (x86 systems only)
+   - __SVR4 (Solaris only)
+   - __sparcv9: 64-bit solaris
+   - __SUNPRO_C
+   - _LP64: defined in 64-bit LP64 mode, but only if <sys/types.h> is included
+
+Borland C/C++ predefines the following:
+   - __BORLANDC__:
+
+DEC/Compaq C/C++ on Alpha:
+   - __alpha
+   - __arch64__
+   - __unix__ (on Tru64 Unix)
+   - __osf__
+   - __DECC
+   - __DECCXX (C++ compilation)
+   - __DECC_VER
+   - __DECCXX_VER
+
+IBM's AIX compiler:
+   - __64BIT__ if 64-bit mode
+   - _AIX
+   - __IBMC__: C compiler version
+   - __IBMCPP__: C++ compiler version
+   - _LONG_LONG: compiler allows long long
+
+Watcom:
+   - __WATCOMC__
+   - __DOS__ : if targeting DOS
+   - __386__ : if 32-bit support
+   - __WIN32__ : if targeting 32-bit Windows
+
+HP-UX C/C++ Compiler:
+   - __hpux
+   - __unix
+   - __hppa (on PA-RISC)
+   - __LP64__: if compiled in 64-bit mode
+
+Metrowerks:
+   - __MWERKS__
+   - __powerpc__
+   - _powerc
+   - __MC68K__
+   - macintosh when compiling for MacOS
+   - __INTEL__ for x86 targets
+   - __POWERPC__
+
+*/
+
+/*
+** ----------------------------------------------------------------------------
+** Include <limits.h> optionally
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_USE_LIMITS_H
+#  include <limits.h>
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine compilation environment
+** ----------------------------------------------------------------------------
+*/
+#if defined __ECC || defined __ICC || defined __INTEL_COMPILER
+#  define POSH_COMPILER_STRING "Intel C/C++"
+#  define POSH_COMPILER_INTEL 1
+#endif
+
+#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__
+#  define POSH_COMPILER_STRING    "MIPSpro C/C++"
+#  define POSH_COMPILER_MIPSPRO 1 
+#endif
+
+#if defined __hpux && !defined __GNUC__
+#  define POSH_COMPILER_STRING "HP-UX CC"
+#  define POSH_COMPILER_HPCC 1 
+#endif
+
+#if defined __GNUC__ && !defined __clang__
+#  define POSH_COMPILER_STRING "Gnu GCC"
+#  define POSH_COMPILER_GCC 1
+#endif
+
+#if defined __clang__
+#  define POSH_COMPILER_STRING "Clang"
+#  define POSH_COMPILER_CLANG 1
+#endif
+
+#if defined __APPLE_CC__
+   /* we don't define the compiler string here, let it be GNU */
+#  define POSH_COMPILER_APPLECC 1
+#endif
+
+#if defined __IBMC__ || defined __IBMCPP__
+#  define POSH_COMPILER_STRING "IBM C/C++"
+#  define POSH_COMPILER_IBM 1
+#endif
+
+#if defined _MSC_VER
+#  define POSH_COMPILER_STRING "Microsoft Visual C++"
+#  define POSH_COMPILER_MSVC 1
+#endif
+
+#if defined __SUNPRO_C
+#  define POSH_COMPILER_STRING "Sun Pro" 
+#  define POSH_COMPILER_SUN 1
+#endif
+
+#if defined __BORLANDC__
+#  define POSH_COMPILER_STRING "Borland C/C++"
+#  define POSH_COMPILER_BORLAND 1
+#endif
+
+#if defined __MWERKS__
+#  define POSH_COMPILER_STRING     "MetroWerks CodeWarrior"
+#  define POSH_COMPILER_METROWERKS 1
+#endif
+
+#if defined __DECC || defined __DECCXX
+#  define POSH_COMPILER_STRING "Compaq/DEC C/C++"
+#  define POSH_COMPILER_DEC 1
+#endif
+
+#if defined __WATCOMC__
+#  define POSH_COMPILER_STRING "Watcom C/C++"
+#  define POSH_COMPILER_WATCOM 1
+#endif
+
+#if !defined POSH_COMPILER_STRING
+#  define POSH_COMPILER_STRING "Unknown compiler"
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine target operating system
+** ----------------------------------------------------------------------------
+*/
+#if defined linux || defined __linux__
+#  define POSH_OS_LINUX 1 
+#  define POSH_OS_STRING "Linux"
+#endif
+
+#if defined __FreeBSD__
+#  define POSH_OS_FREEBSD 1 
+#  define POSH_OS_STRING "FreeBSD"
+#endif
+
+#if defined __CYGWIN32__
+#  define POSH_OS_CYGWIN32 1
+#  define POSH_OS_STRING "Cygwin"
+#endif
+
+#if defined GEKKO
+#  define POSH_OS_GAMECUBE
+#  define __powerpc__
+#  define POSH_OS_STRING "GameCube"
+#endif
+
+#if defined __MINGW32__
+#  define POSH_OS_MINGW 1
+#  define POSH_OS_STRING "MinGW"
+#endif
+
+#if defined GO32 && defined DJGPP && defined __MSDOS__
+#  define POSH_OS_GO32 1
+#  define POSH_OS_STRING "GO32/MS-DOS"
+#endif
+
+/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS,
+   otherwise Watcom assumes host=target */
+#if defined __WATCOMC__  && defined __386__ && defined __DOS__
+#  define POSH_OS_DOS32 1
+#  define POSH_OS_STRING "DOS/32-bit"
+#endif
+
+#if defined _UNICOS
+#  define POSH_OS_UNICOS 1
+#  define POSH_OS_STRING "UNICOS"
+#endif
+
+//ACS if we're in xcode, look at the target conditionals to figure out if this is ios or osx
+#if defined __APPLE__
+#  include "TargetConditionals.h"
+#endif
+#if TARGET_OS_IPHONE
+#    define POSH_OS_IOS 1
+#    define POSH_OS_STRING "iOS"
+#else
+#  if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#    define POSH_OS_OSX 1
+#    define POSH_OS_STRING "MacOS X"
+#  endif
+#endif
+
+#if defined __sun__ || defined sun || defined __sun || defined __solaris__
+#  if defined __SVR4 || defined __svr4__ || defined __solaris__
+#     define POSH_OS_STRING "Solaris"
+#     define POSH_OS_SOLARIS 1
+#  endif
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "SunOS"
+#     define POSH_OS_SUNOS 1
+#  endif
+#endif
+
+#if defined __sgi__ || defined sgi || defined __sgi
+#  define POSH_OS_IRIX 1
+#  define POSH_OS_STRING "Irix"
+#endif
+
+#if defined __hpux__ || defined __hpux
+#  define POSH_OS_HPUX 1
+#  define POSH_OS_STRING "HP-UX"
+#endif
+
+#if defined _AIX
+#  define POSH_OS_AIX 1
+#  define POSH_OS_STRING "AIX"
+#endif
+
+#if ( defined __alpha && defined __osf__ )
+#  define POSH_OS_TRU64 1
+#  define POSH_OS_STRING "Tru64"
+#endif
+
+#if defined __BEOS__ || defined __beos__
+#  define POSH_OS_BEOS 1
+#  define POSH_OS_STRING "BeOS"
+#endif
+
+#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA
+#  define POSH_OS_AMIGA 1
+#  define POSH_OS_STRING "Amiga"
+#endif
+
+#if defined __unix__
+#  define POSH_OS_UNIX 1 
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "Unix-like(generic)"
+#  endif
+#endif
+
+#if defined _WIN32_WCE
+#  define POSH_OS_WINCE 1
+#  define POSH_OS_STRING "Windows CE"
+#endif
+
+#if defined _XBOX || defined _XBOX_VER
+#  define POSH_OS_XBOX 1
+#  define POSH_OS_STRING "XBOX"
+#endif
+
+#if defined __ORBIS__
+#   define POSH_OS_ORBIS
+#endif
+
+#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__
+#  if !defined POSH_OS_XBOX
+#  define POSH_OS_WIN32 1
+#     if defined _WIN64
+#        define POSH_OS_WIN64 1
+#        define POSH_OS_STRING "Win64"
+#     else
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win32"
+#        endif
+#     endif
+#  endif
+#endif
+
+#if defined __palmos__
+#  define POSH_OS_PALM 1
+#  define POSH_OS_STRING "PalmOS"
+#endif
+
+#if defined THINK_C || defined macintosh
+#  define POSH_OS_MACOS 1
+#  define POSH_OS_STRING "MacOS"
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Determine target CPU
+** -----------------------------------------------------------------------------
+*/
+
+#if defined GEKKO
+#  define POSH_CPU_PPC750 1
+#  define POSH_CPU_STRING "IBM PowerPC 750 (NGC)"
+#endif
+
+#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000
+#  define POSH_CPU_68K 1
+#  define POSH_CPU_STRING "MC68000"
+#endif
+
+#if defined __PPC__ || defined __POWERPC__  || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC
+#  define POSH_CPU_PPC 1
+#  if !defined POSH_CPU_STRING
+#    if defined __powerpc64__
+#       define POSH_CPU_STRING "PowerPC64"
+#    else
+#       define POSH_CPU_STRING "PowerPC"
+#    endif
+#  endif
+#endif
+
+#if defined _CRAYT3E || defined _CRAYMPP
+#  define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/
+#  define POSH_CPU_STRING "Cray T3E (Alpha 21164)"
+#endif
+
+#if defined CRAY || defined _CRAY && !defined _CRAYT3E
+#  error Non-AXP Cray systems not supported
+#endif
+
+#if defined _SH3
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_STRING "Hitachi SH-3"
+#endif
+
+#if defined __sh4__ || defined __SH4__
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_SH4 1
+#  define POSH_CPU_STRING "Hitachi SH-4"
+#endif
+
+#if defined __sparc__ || defined __sparc
+#  if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__
+#     define POSH_CPU_SPARC64 1 
+#     define POSH_CPU_STRING "Sparc/64"
+#  else
+#     define POSH_CPU_STRING "Sparc/32"
+#  endif
+#  define POSH_CPU_SPARC 1
+#endif
+
+#if defined ARM || defined __arm__ || defined _ARM || defined _M_ARM || defined __aarch64__ || defined _M_ARM64 /* 32- and 64-bit ARM, GCC/Clang and MSVC spellings */
+#  define POSH_CPU_STRONGARM 1 /* historical name, kept so existing checks keep working for every ARM variant matched above */
+#  define POSH_CPU_STRING "ARM"
+#endif
+
+#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS
+#  define POSH_CPU_MIPS 1 
+#  if defined _R5900
+#    define POSH_CPU_STRING "MIPS R5900 (PS2)"
+#  else
+#    define POSH_CPU_STRING "MIPS"
+#  endif
+#endif
+
+#if defined __ia64 || defined _M_IA64 || defined __ia64__ 
+#  define POSH_CPU_IA64 1
+#  define POSH_CPU_STRING "IA64"
+#endif
+
+#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64
+#  define POSH_CPU_X86 1
+#  if defined __x86_64__ || defined _M_X64
+#     define POSH_CPU_X86_64 1 
+#  endif
+#  if defined POSH_CPU_X86_64
+#     define POSH_CPU_STRING "AMD x86-64"
+#  else
+#     define POSH_CPU_STRING "Intel 386+"
+#  endif
+#endif
+
+#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__
+#  define POSH_CPU_AXP 1
+#  define POSH_CPU_STRING "AXP"
+#endif
+
+#if defined __hppa || defined hppa
+#  define POSH_CPU_HPPA 1
+#  define POSH_CPU_STRING "PA-RISC"
+#endif
+
+#if !defined POSH_CPU_STRING
+#  error POSH cannot determine target CPU
+#  define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Attempt to autodetect building for embedded on Sony PS2
+** -----------------------------------------------------------------------------
+*/
+#if !defined POSH_OS_STRING
+#  if !defined FORCE_DOXYGEN
+#    define POSH_OS_EMBEDDED 1 
+#  endif
+#  if defined _R5900
+#     define POSH_OS_STRING "Sony PS2(embedded)"
+#  else
+#     define POSH_OS_STRING "Embedded/Unknown"
+#  endif
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Handle cdecl, stdcall, fastcall, etc.
+** ---------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64
+#  if defined __GNUC__
+#     define POSH_CDECL __attribute__((cdecl))
+#     define POSH_STDCALL __attribute__((stdcall))
+#     define POSH_FASTCALL __attribute__((fastcall))
+#  elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ )
+#     define POSH_CDECL    __cdecl
+#     define POSH_STDCALL  __stdcall
+#     define POSH_FASTCALL __fastcall
+#  endif
+#else
+#  define POSH_CDECL    
+#  define POSH_STDCALL  
+#  define POSH_FASTCALL 
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB
+** ---------------------------------------------------------------------------
+*/
+
+/*
+** We undefine this so that multiple inclusions will work
+*/
+#if defined POSH_IMPORTEXPORT
+#  undef POSH_IMPORTEXPORT
+#endif
+
+#if defined POSH_DLL
+#   if defined POSH_OS_WIN32
+#      if defined _MSC_VER 
+#         if ( _MSC_VER >= 800 )
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif  /* defined _MSC_VER */
+#      if defined __BORLANDC__
+#         if ( __BORLANDC__ >= 0x500 )
+#            if defined POSH_BUILDING_LIB 
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif /* defined __BORLANDC__ */
+       /* for all other compilers, we're just making a blanket assumption */
+#      if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__
+#         if defined POSH_BUILDING_LIB
+#            define POSH_IMPORTEXPORT __declspec( dllexport )
+#         else
+#            define POSH_IMPORTEXPORT __declspec( dllimport )
+#         endif
+#      endif /* all other compilers */
+#      if !defined POSH_IMPORTEXPORT
+#         error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how)
+#      endif
+#   endif /* defined POSH_OS_WIN32 */
+#endif
+
+/* On pretty much everything else, we can thankfully just ignore this */
+#if !defined POSH_IMPORTEXPORT
+#  define POSH_IMPORTEXPORT
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_DLL    
+#  define POSH_BUILDING_LIB
+#  undef POSH_DLL
+#  undef POSH_BUILDING_LIB
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** (Re)define POSH_PUBLIC_API export signature 
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_PUBLIC_API
+#  undef POSH_PUBLIC_API
+#endif
+
+#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) )
+#  define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT 
+#else
+#  define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Try to infer endianness.  Basically we just go through the CPUs we know are
+** little endian, and assume anything that isn't one of those is big endian.
+** As a sanity check, we also do this with operating systems we know are
+** little endian, such as Windows.  Some processors are bi-endian, such as 
+** the MIPS series, so we have to be careful about those.
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__
+#  define POSH_ENDIAN_STRING "little"
+#  define POSH_LITTLE_ENDIAN 1
+#else
+#  define POSH_ENDIAN_STRING "big"
+#  define POSH_BIG_ENDIAN 1
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_LITTLE_ENDIAN
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Cross-platform compile time assertion macro
+** ----------------------------------------------------------------------------
+*/
+#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ]
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit Integer
+**
+** We don't require 64-bit support, nor do we emulate its functionality, we
+** simply export it if it's available.  Since we can't count on <limits.h>
+** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive.
+** ----------------------------------------------------------------------------
+*/
+#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64 /* pick the 64-bit integer type for this toolchain: long here */
+#  define POSH_64BIT_INTEGER 1
+typedef long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x)) /* argument parenthesized so a whole expression is cast, not just its first operand */
+#  define POSH_U64( x ) ((posh_u64_t)(x))
+#  define POSH_I64_PRINTF_PREFIX "l"
+#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC ) /* compilers providing __int64 */
+#  define POSH_64BIT_INTEGER 1
+typedef __int64 posh_i64_t;
+typedef unsigned __int64 posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x)) /* argument parenthesized so a whole expression is cast, not just its first operand */
+#  define POSH_U64( x ) ((posh_u64_t)(x))
+#  define POSH_I64_PRINTF_PREFIX "I64"
+#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC /* compilers providing long long */
+#  define POSH_64BIT_INTEGER 1
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+#  define POSH_U64( x ) ((posh_u64_t)(x##LL)) /* x must be a literal here: the LL suffix is pasted onto it */
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "ll"
+#endif
+
+/* hack */
+/*#ifdef __MINGW32__
+#undef POSH_I64
+#undef POSH_U64
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64( x ) ((posh_i64_t)x)
+#define POSH_U64( x ) ((posh_u64_t)x)
+#define POSH_I64_PRINTF_PREFIX "I64"
+#endif*/
+
+#ifdef FORCE_DOXYGEN
+typedef long long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_64BIT_INTEGER
+#  define POSH_I64_PRINTF_PREFIX
+#  define POSH_I64(x)
+#  define POSH_U64(x)
+#endif
+
+/** Minimum value for a 64-bit signed integer */
+#define POSH_I64_MIN  POSH_I64(0x8000000000000000)
+/** Maximum value for a 64-bit signed integer */
+#define POSH_I64_MAX  POSH_I64(0x7FFFFFFFFFFFFFFF)
+/** Minimum value for a 64-bit unsigned integer */
+#define POSH_U64_MIN  POSH_U64(0)
+/** Maximum value for a 64-bit unsigned integer */
+#define POSH_U64_MAX  POSH_U64(0xFFFFFFFFFFFFFFFF)
+
+/* ----------------------------------------------------------------------------
+** Basic Sized Types
+**
+** These types are expected to be EXACTLY sized so you can use them for
+** serialization.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_FALSE 0 
+#define POSH_TRUE  1 
+
+typedef int            posh_bool_t;
+typedef unsigned char  posh_byte_t;
+
+/* NOTE: These assume that CHAR_BIT is 8!! */
+typedef unsigned char  posh_u8_t;
+typedef signed char    posh_i8_t;
+
+#if defined POSH_USE_LIMITS_H
+#  if CHAR_BIT > 8 /* CHAR_BIT is the <limits.h> macro; the former CHAR_BITS is undefined, evaluates to 0 and never triggered */
+#    error This machine uses 9-bit characters.  This is a warning, you can comment this out now.
+#  endif /* CHAR_BIT > 8 */
+
+/* 16-bit */
+#  if ( USHRT_MAX == 65535 ) 
+   typedef unsigned short posh_u16_t;
+   typedef short          posh_i16_t;
+#  else
+   /* Yes, in theory there could still be a 16-bit character type and shorts are
+      32-bits in size...if you find such an architecture, let me know =P */
+#    error No 16-bit type found
+#  endif
+
+/* 32-bit: prefer int, fall back to long, otherwise stop the build with a proper #error */
+#  if ( INT_MAX == 2147483647 )
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  elif ( LONG_MAX == 2147483647 )
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  else
+#    error No 32-bit type found
+#  endif
+
+#else /* POSH_USE_LIMITS_H */
+
+  typedef unsigned short posh_u16_t;
+  typedef short          posh_i16_t;
+
+#  if !defined POSH_OS_PALM
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  else
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  endif
+#endif
+
+/** Minimum value for a byte */
+#define POSH_BYTE_MIN    0
+/** Maximum value for an 8-bit unsigned value */
+#define POSH_BYTE_MAX    255
+/** Minimum value for a 16-bit signed value */
+#define POSH_I16_MIN     ( ( posh_i16_t ) 0x8000 )
+/** Maximum value for a 16-bit signed value */
+#define POSH_I16_MAX     ( ( posh_i16_t ) 0x7FFF ) 
+/** Minimum value for a 16-bit unsigned value */
+#define POSH_U16_MIN     0
+/** Maximum value for a 16-bit unsigned value */
+#define POSH_U16_MAX     ( ( posh_u16_t ) 0xFFFF )
+/** Minimum value for a 32-bit signed value */
+#define POSH_I32_MIN     ( ( posh_i32_t ) 0x80000000 )
+/** Maximum value for a 32-bit signed value */
+#define POSH_I32_MAX     ( ( posh_i32_t ) 0x7FFFFFFF )
+/** Minimum value for a 32-bit unsigned value */
+#define POSH_U32_MIN     0
+/** Maximum value for a 32-bit unsigned value */
+#define POSH_U32_MAX     ( ( posh_u32_t ) 0xFFFFFFFF )
+
+/*
+** ----------------------------------------------------------------------------
+** Sanity checks on expected sizes
+** ----------------------------------------------------------------------------
+*/
+#if !defined FORCE_DOXYGEN
+
+POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4);
+POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4);
+
+#if !defined POSH_NO_FLOAT
+   POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 );
+   POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8);
+#endif
+
+#if defined POSH_64BIT_INTEGER
+   POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8);
+   POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8);
+#endif
+
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit pointer support
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX )
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC
+#   define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_64BIT_POINTER
+   POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 );
+#elif !defined FORCE_DOXYGEN
+/* if this assertion is hit then you're on a system that either has 64-bit
+   addressing and we didn't catch it, or you're on a system with 16-bit
+   pointers.  In the latter case, POSH doesn't actually care, we're just
+   triggering this assertion to make sure you're aware of the situation,
+   so feel free to delete it.
+
+   If this assertion is triggered on a known 32 or 64-bit platform, 
+   please let us know (poshlib@poshlib.org) */
+   POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 );
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_64BIT_POINTER
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** POSH Utility Functions
+**
+** These are optional POSH utility functions that are not required if you don't
+** need anything except static checking of your host and target environment.
+** 
+** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want
+** to enforce their export if your own library is only using them internally.
+** ----------------------------------------------------------------------------
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char *POSH_GetArchString( void );
+
+#if !defined POSH_NO_FLOAT
+
+posh_u32_t  POSH_LittleFloatBits( float f );
+posh_u32_t  POSH_BigFloatBits( float f );
+float       POSH_FloatFromLittleBits( posh_u32_t bits );
+float       POSH_FloatFromBigBits( posh_u32_t bits );
+
+void        POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] );
+double      POSH_DoubleFromBits( const posh_byte_t src[ 8 ] );
+
+/* unimplemented
+float      *POSH_WriteFloatToLittle( void *dst, float f );
+float      *POSH_WriteFloatToBig( void *dst, float f );
+float       POSH_ReadFloatFromLittle( const void *src );
+float       POSH_ReadFloatFromBig( const void *src );
+
+double     *POSH_WriteDoubleToLittle( void *dst, double d );
+double     *POSH_WriteDoubleToBig( void *dst, double d );
+double      POSH_ReadDoubleFromLittle( const void *src );
+double      POSH_ReadDoubleFromBig( const void *src );
+*/
+#endif /* !defined POSH_NO_FLOAT */
+
+#if defined FORCE_DOXYGEN
+#  define POSH_NO_FLOAT
+#  undef  POSH_NO_FLOAT
+#endif
+
+extern posh_u16_t  POSH_SwapU16( posh_u16_t u );
+extern posh_i16_t  POSH_SwapI16( posh_i16_t u );
+extern posh_u32_t  POSH_SwapU32( posh_u32_t u );
+extern posh_i32_t  POSH_SwapI32( posh_i32_t u );
+
+#if defined POSH_64BIT_INTEGER
+
+extern posh_u64_t  POSH_SwapU64( posh_u64_t u );
+extern posh_i64_t  POSH_SwapI64( posh_i64_t u );
+
+#endif /*POSH_64BIT_INTEGER */
+
+extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value );
+
+extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value );
+
+extern posh_u16_t  POSH_ReadU16FromLittle( const void *src );
+extern posh_i16_t  POSH_ReadI16FromLittle( const void *src );
+extern posh_u32_t  POSH_ReadU32FromLittle( const void *src );
+extern posh_i32_t  POSH_ReadI32FromLittle( const void *src );
+
+extern posh_u16_t  POSH_ReadU16FromBig( const void *src );
+extern posh_i16_t  POSH_ReadI16FromBig( const void *src );
+extern posh_u32_t  POSH_ReadU32FromBig( const void *src );
+extern posh_i32_t  POSH_ReadI32FromBig( const void *src );
+
+#if defined POSH_64BIT_INTEGER
+extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value );
+extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value );
+
+extern posh_u64_t  POSH_ReadU64FromLittle( const void *src );
+extern posh_i64_t  POSH_ReadI64FromLittle( const void *src );
+extern posh_u64_t  POSH_ReadU64FromBig( const void *src );
+extern posh_i64_t  POSH_ReadI64FromBig( const void *src );
+#endif /* POSH_64BIT_INTEGER */
+
+#if defined POSH_LITTLE_ENDIAN
+
+#  define POSH_LittleU16(x) (x)
+#  define POSH_LittleU32(x) (x)
+#  define POSH_LittleI16(x) (x)
+#  define POSH_LittleI32(x) (x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) (x)
+#    define POSH_LittleI64(x) (x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#  define POSH_BigU16(x) POSH_SwapU16(x)
+#  define POSH_BigU32(x) POSH_SwapU32(x)
+#  define POSH_BigI16(x) POSH_SwapI16(x)
+#  define POSH_BigI32(x) POSH_SwapI32(x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) POSH_SwapU64(x)
+#    define POSH_BigI64(x) POSH_SwapI64(x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#else
+
+#  define POSH_BigU16(x) (x)
+#  define POSH_BigU32(x) (x)
+#  define POSH_BigI16(x) (x)
+#  define POSH_BigI32(x) (x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) (x)
+#    define POSH_BigI64(x) (x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#  define POSH_LittleU16(x) POSH_SwapU16(x)
+#  define POSH_LittleU32(x) POSH_SwapU32(x)
+#  define POSH_LittleI16(x) POSH_SwapI16(x)
+#  define POSH_LittleI32(x) POSH_SwapI32(x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) POSH_SwapU64(x)
+#    define POSH_LittleI64(x) POSH_SwapI64(x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+
diff --git a/thirdparty/thekla_atlas/thekla/thekla_atlas.cpp b/thirdparty/thekla_atlas/thekla/thekla_atlas.cpp
new file mode 100644
index 0000000000..d6f0accf54
--- /dev/null
+++ b/thirdparty/thekla_atlas/thekla/thekla_atlas.cpp
@@ -0,0 +1,271 @@
+
+#include "thekla_atlas.h"
+
+#include <cfloat>
+
+#include "nvmesh/halfedge/Edge.h"
+#include "nvmesh/halfedge/Mesh.h"
+#include "nvmesh/halfedge/Face.h"
+#include "nvmesh/halfedge/Vertex.h"
+#include "nvmesh/param/Atlas.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/ftoi.h"
+
+#include "nvcore/Array.inl"
+
+
+using namespace Thekla;
+using namespace nv;
+
+
+// Stores `code` in `*error` when `error` is non-null; always returns NULL.
+inline Atlas_Output_Mesh * set_error(Atlas_Error * error, Atlas_Error code) {
+    if (error != NULL) { *error = code; }
+    return NULL;
+}
+
+
+// Fills `mesh` from the flat vertex/face arrays of `input`. Faces for which
+// addFace() returns NULL are counted; if there were any and `error` is
+// non-null, *error becomes Atlas_Error_Invalid_Mesh_Non_Manifold. On any other
+// path *error is not written.
+static void input_to_mesh(const Atlas_Input_Mesh * input, HalfEdge::Mesh * mesh, Atlas_Error * error) {
+    const int vertex_total = input->vertex_count;
+
+    // One entry per input vertex, taken from its `first_colocal` field.
+    Array<uint> colocal_map;
+    colocal_map.reserve(vertex_total);
+
+    for (int vi = 0; vi < vertex_total; vi++) {
+        const Atlas_Input_Vertex & src = input->vertex_array[vi];
+
+        HalfEdge::Vertex * added = mesh->addVertex(Vector3(src.position[0], src.position[1], src.position[2]));
+        added->nor.set(src.normal[0], src.normal[1], src.normal[2]);
+        added->tex.set(src.uv[0], src.uv[1]);
+
+        colocal_map.append(src.first_colocal);
+    }
+
+    mesh->linkColocalsWithCanonicalMap(colocal_map);
+
+    // Add the triangles, counting the ones the half-edge mesh refuses.
+    const int face_total = input->face_count;
+    int rejected_faces = 0;
+    for (int fi = 0; fi < face_total; fi++) {
+        const Atlas_Input_Face & tri = input->face_array[fi];
+        const int a = tri.vertex_index[0];
+        const int b = tri.vertex_index[1];
+        const int c = tri.vertex_index[2];
+
+        HalfEdge::Face * added = mesh->addFace(a, b, c);
+        if (added == NULL) {
+            rejected_faces++;
+            continue;
+        }
+        added->material = tri.material_index;
+    }
+
+    mesh->linkBoundary();
+
+    if (rejected_faces != 0 && error != NULL) {
+        *error = Atlas_Error_Invalid_Mesh_Non_Manifold;
+    }
+}
+
+// Converts atlas.meshAt(0) into a newly allocated Atlas_Output_Mesh: one output
+// vertex per chart vertex (UV plus originating vertex index) and three indices
+// per face of `mesh`. Writes Atlas_Error_Success to *error (must be non-null).
+static Atlas_Output_Mesh * mesh_atlas_to_output(const HalfEdge::Mesh * mesh, const Atlas & atlas, Atlas_Error * error) {
+    Atlas_Output_Mesh * result = new Atlas_Output_Mesh;
+    const MeshCharts * mesh_charts = atlas.meshAt(0);
+
+    // Vertex storage: one slot per chart vertex.
+    const int total_vertices = mesh_charts->vertexCount();
+    result->vertex_count = total_vertices;
+    result->vertex_array = new Atlas_Output_Vertex[total_vertices];
+
+    int extent_u = 0;   // largest ftoi_ceil(uv.x) written below
+    int extent_v = 0;   // largest ftoi_ceil(uv.y) written below
+
+    const int total_charts = mesh_charts->chartCount();
+    for (int ci = 0; ci < total_charts; ci++) {
+        const Chart * current = mesh_charts->chartAt(ci);
+        const uint first_slot = mesh_charts->vertexCountBeforeChartAt(ci);
+        const uint local_count = current->vertexCount();
+
+        for (uint vi = 0; vi < local_count; vi++) {
+            Atlas_Output_Vertex & slot = result->vertex_array[first_slot + vi];
+            slot.xref = current->mapChartVertexToOriginalVertex(vi);
+
+            const Vector2 uv = current->chartMesh()->vertexAt(vi)->tex;
+            slot.uv[0] = uv.x;
+            slot.uv[1] = uv.y;
+            extent_u = max(extent_u, ftoi_ceil(uv.x));
+            extent_v = max(extent_v, ftoi_ceil(uv.y));
+        }
+    }
+
+    // Index storage: three entries per face of `mesh`.
+    const int total_faces = mesh->faceCount();
+    const int total_indices = total_faces * 3;
+    result->index_count = total_indices;
+    result->index_array = new int[total_indices];
+
+    for (int f = 0; f < total_faces; f++) {
+        const uint chart_id = mesh_charts->faceChartAt(f);
+        const uint i = mesh_charts->faceIndexWithinChartAt(f);
+        const uint first_slot = mesh_charts->vertexCountBeforeChartAt(chart_id);
+
+        const Chart * chart = mesh_charts->chartAt(chart_id);
+        nvDebugCheck(chart->faceAt(i) == f);
+
+        // The face's edge and its two successors supply the three vertex ids.
+        const HalfEdge::Edge * e0 = chart->chartMesh()->faceAt(i)->edge;
+        const HalfEdge::Edge * e1 = e0->next;
+        const HalfEdge::Edge * e2 = e1->next;
+        int * corner = result->index_array + 3 * f;
+        corner[0] = first_slot + e0->vertex->id;
+        corner[1] = first_slot + e1->vertex->id;
+        corner[2] = first_slot + e2->vertex->id;
+    }
+
+    *error = Atlas_Error_Success;
+    result->atlas_width = extent_u;
+    result->atlas_height = extent_v;
+    return result;
+}
+
+
+// Fills `*options` with the defaults used on The Witness; NULL is ignored.
+void Thekla::atlas_set_default_options(Atlas_Options * options) {
+    if (options == NULL) return;
+
+    // Algorithm selection for the charter, mapper and packer stages.
+    options->charter = Atlas_Charter_Default;
+    options->mapper = Atlas_Mapper_Default;
+    options->packer = Atlas_Packer_Default;
+    // Charter metric weights and size limits.
+    options->charter_options.witness.proxy_fit_metric_weight = 2.0f;
+    options->charter_options.witness.roundness_metric_weight = 0.01f;
+    options->charter_options.witness.straightness_metric_weight = 6.0f;
+    options->charter_options.witness.normal_seam_metric_weight = 4.0f;
+    options->charter_options.witness.texture_seam_metric_weight = 0.5f;
+    options->charter_options.witness.max_chart_area = FLT_MAX;
+    options->charter_options.witness.max_boundary_length = FLT_MAX;
+    // Packer settings.
+    options->packer_options.witness.packing_quality = 0;
+    options->packer_options.witness.texel_area = 8;
+    options->packer_options.witness.block_align = true;
+    options->packer_options.witness.conservative = false;
+}
+
+
+// Failure exit once *error is initialised: keeps a code that is already set,
+// otherwise records Atlas_Error_Invalid_Mesh so NULL never pairs with success.
+// NOTE(review): Atlas_Error has no dedicated "atlas failed" value; confirm
+// that Invalid_Mesh is the right stand-in for callers.
+static Atlas_Output_Mesh * atlas_failed(Atlas_Error * error) {
+    if (*error == Atlas_Error_Success) *error = Atlas_Error_Invalid_Mesh;
+    return NULL;
+}
+
+// Builds a UV atlas for `input` with the charter/mapper/packer in `options`.
+// Returns the mesh produced by mesh_atlas_to_output() (release with
+// atlas_free), or NULL with the reason in *error. A NULL `error` also yields
+// NULL, but then nothing can be reported.
+Atlas_Output_Mesh * Thekla::atlas_generate(const Atlas_Input_Mesh * input, const Atlas_Options * options, Atlas_Error * error) {
+    // Validate args.
+    if (input == NULL || options == NULL || error == NULL) return set_error(error, Atlas_Error_Invalid_Args);
+
+    // *error is read below and the caller may pass an uninitialised variable.
+    *error = Atlas_Error_Success;
+
+    // Validate options: only one algorithm per stage is accepted.
+    // @@ Validate the per-algorithm option values!
+    if (options->charter != Atlas_Charter_Witness ||
+        options->mapper != Atlas_Mapper_LSCM ||
+        options->packer != Atlas_Packer_Witness)
+    {
+        return set_error(error, Atlas_Error_Invalid_Options);
+    }
+
+    // Validate input mesh: every face index must refer to an existing vertex.
+    for (int i = 0; i < input->face_count; i++) {
+        int v0 = input->face_array[i].vertex_index[0];
+        int v1 = input->face_array[i].vertex_index[1];
+        int v2 = input->face_array[i].vertex_index[2];
+
+        if (v0 < 0 || v0 >= input->vertex_count ||
+            v1 < 0 || v1 >= input->vertex_count ||
+            v2 < 0 || v2 >= input->vertex_count)
+        {
+            return set_error(error, Atlas_Error_Invalid_Mesh);
+        }
+    }
+
+    // Build half edge mesh.
+    AutoPtr<HalfEdge::Mesh> mesh(new HalfEdge::Mesh);
+
+    // May leave Atlas_Error_Invalid_Mesh_Non_Manifold in *error; generation
+    // still continues in that case.
+    input_to_mesh(input, mesh.ptr(), error);
+
+    if (*error == Atlas_Error_Invalid_Mesh) {
+        return NULL;
+    }
+
+    Atlas atlas;
+
+    // Charter. Atlas_Charter_Extract never gets here (rejected above).
+    if (options->charter == Atlas_Charter_Witness) {
+        SegmentationSettings segmentation_settings;
+        segmentation_settings.proxyFitMetricWeight = options->charter_options.witness.proxy_fit_metric_weight;
+        segmentation_settings.roundnessMetricWeight = options->charter_options.witness.roundness_metric_weight;
+        segmentation_settings.straightnessMetricWeight = options->charter_options.witness.straightness_metric_weight;
+        segmentation_settings.normalSeamMetricWeight = options->charter_options.witness.normal_seam_metric_weight;
+        segmentation_settings.textureSeamMetricWeight = options->charter_options.witness.texture_seam_metric_weight;
+        segmentation_settings.maxChartArea = options->charter_options.witness.max_chart_area;
+        segmentation_settings.maxBoundaryLength = options->charter_options.witness.max_boundary_length;
+
+        Array<uint> uncharted_materials;
+        atlas.computeCharts(mesh.ptr(), segmentation_settings, uncharted_materials);
+    }
+
+    if (atlas.hasFailed())
+        return atlas_failed(error);
+
+    // Mapper.
+    if (options->mapper == Atlas_Mapper_LSCM) {
+        atlas.parameterizeCharts();
+    }
+
+    if (atlas.hasFailed())
+        return atlas_failed(error);
+
+    // Packer.
+    if (options->packer == Atlas_Packer_Witness) {
+        int packing_quality = options->packer_options.witness.packing_quality;
+        float texel_area = options->packer_options.witness.texel_area;
+        int block_align = options->packer_options.witness.block_align;
+        int conservative = options->packer_options.witness.conservative;
+
+        /*float utilization =*/ atlas.packCharts(packing_quality, texel_area, block_align, conservative);
+    }
+
+    if (atlas.hasFailed())
+        return atlas_failed(error);
+
+    // Build output mesh.
+    return mesh_atlas_to_output(mesh.ptr(), atlas, error);
+}
+
+
+// Releases an Atlas_Output_Mesh and both of its arrays; NULL is a no-op.
+void Thekla::atlas_free(Atlas_Output_Mesh * output) {
+    if (output == NULL) return;
+    delete [] output->index_array;
+    delete [] output->vertex_array;
+    delete output;
+}
+
diff --git a/thirdparty/thekla_atlas/thekla/thekla_atlas.h b/thirdparty/thekla_atlas/thekla/thekla_atlas.h
new file mode 100644
index 0000000000..1d0716e781
--- /dev/null
+++ b/thirdparty/thekla_atlas/thekla/thekla_atlas.h
@@ -0,0 +1,116 @@
+
+// Thekla Atlas Generator
+#pragma once
+namespace Thekla {
+// Charting stage selector. atlas_generate() accepts only Atlas_Charter_Witness.
+enum Atlas_Charter {
+    Atlas_Charter_Witness,  // Options: threshold
+    Atlas_Charter_Extract,  // Options: ---
+    Atlas_Charter_Default = Atlas_Charter_Witness
+};
+// Mapper stage selector (atlas_generate() runs Atlas::parameterizeCharts() for LSCM).
+enum Atlas_Mapper {
+    Atlas_Mapper_LSCM,      // Options: ---
+    Atlas_Mapper_Default = Atlas_Mapper_LSCM
+};
+// Packer stage selector (atlas_generate() runs Atlas::packCharts() for Witness).
+enum Atlas_Packer {
+    Atlas_Packer_Witness,   // Options: texel_area
+    Atlas_Packer_Default = Atlas_Packer_Witness
+};
+// Algorithm choice per stage plus that algorithm's settings; see atlas_set_default_options().
+struct Atlas_Options {
+    Atlas_Charter charter;
+    union {
+        struct {
+            float proxy_fit_metric_weight;
+            float roundness_metric_weight;
+            float straightness_metric_weight;
+            float normal_seam_metric_weight;
+            float texture_seam_metric_weight;
+            float max_chart_area;
+            float max_boundary_length;
+        } witness;
+        struct {
+        } extract;
+    } charter_options;
+
+    Atlas_Mapper mapper;
+    union {
+    } mapper_options;
+
+    Atlas_Packer packer;
+    union {
+        struct {
+            int packing_quality;
+            float texel_area;       // This is not really texel area, but 1 / texel width?
+            bool block_align;       // Align charts to 4x4 blocks. 
+            bool conservative;      // Pack charts with extra padding.
+        } witness;
+    } packer_options;
+};
+// One input vertex as read by atlas_generate().
+struct Atlas_Input_Vertex {
+    float position[3];
+    float normal[3];
+    float uv[2];
+    int first_colocal;  // This vertex's entry in the map given to Mesh::linkColocalsWithCanonicalMap().
+};
+// One input triangle.
+struct Atlas_Input_Face {
+    int vertex_index[3];    // Each must be in [0, vertex_count); atlas_generate() rejects the mesh otherwise.
+    int material_index;     // Copied to the half-edge face's `material`.
+};
+// Input mesh: `vertex_count` vertices and `face_count` faces in the two arrays.
+struct Atlas_Input_Mesh {
+    int vertex_count;
+    int face_count;
+    Atlas_Input_Vertex * vertex_array;
+    Atlas_Input_Face * face_array;
+};
+// One vertex of the generated atlas mesh.
+struct Atlas_Output_Vertex {
+    float uv[2];
+    int xref;   // Index of input vertex from which this output vertex originated.
+};
+// Result of atlas_generate(); allocated by the library, release with atlas_free().
+struct Atlas_Output_Mesh {
+    int atlas_width;    // Largest ceil'd u over all output vertices (at least 0).
+    int atlas_height;   // Largest ceil'd v over all output vertices (at least 0).
+    int vertex_count;
+    int index_count;    // Three per face.
+    Atlas_Output_Vertex * vertex_array;
+    int * index_array;
+};
+// Status written through the `error` argument of atlas_generate().
+enum Atlas_Error {
+    Atlas_Error_Success,
+    Atlas_Error_Invalid_Args,               // A NULL argument was passed.
+    Atlas_Error_Invalid_Options,            // Unsupported charter, mapper or packer.
+    Atlas_Error_Invalid_Mesh,               // A face refers to a vertex index out of range.
+    Atlas_Error_Invalid_Mesh_Non_Manifold,  // At least one face could not be added to the half-edge mesh.
+    Atlas_Error_Not_Implemented,
+};
+// Writes the default settings into *options; a NULL pointer is ignored.
+void atlas_set_default_options(Atlas_Options * options);
+// Generates the atlas. `error` must be non-NULL. Returns NULL on failure.
+Atlas_Output_Mesh * atlas_generate(const Atlas_Input_Mesh * input, const Atlas_Options * options, Atlas_Error * error);
+// Frees a mesh returned by atlas_generate(); NULL is accepted.
+void atlas_free(Atlas_Output_Mesh * output);
+
+
+/*
+
+Should we represent the input mesh with an opaque structure that simply holds pointers to the user data? That would allow us to avoid having to copy attributes to an intermediate representation.
+
+struct Atlas_Input_Mesh;
+
+void mesh_set_vertex_position(Atlas_Input_Mesh * mesh, float * ptr, int stride);
+void mesh_set_vertex_normal(Atlas_Input_Mesh * mesh, float * ptr, int stride);
+void mesh_set_vertex_uv(Mesh * mesh, float * ptr, int stride);
+
+void mesh_set_index(Mesh * mesh, int * ptr);
+*/
+
+} // Thekla namespace
+
diff --git a/thirdparty/tinyexr/tinyexr.h b/thirdparty/tinyexr/tinyexr.h
index c82768be9a..606c19756a 100644
--- a/thirdparty/tinyexr/tinyexr.h
+++ b/thirdparty/tinyexr/tinyexr.h
@@ -85,11 +85,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <stddef.h>  // for size_t
 #include <stdint.h>  // guess stdint.h is available(C99)
 
-// -- GODOT change for old MinGW on Travis CI --
-#if defined(__MINGW32__)
-#include <_mingw.h>  // for __MINGW64_VERSION_MAJOR
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -264,7 +259,8 @@ typedef struct _DeepImage {
 } DeepImage;
 
 // @deprecated { to be removed. }
-// Loads single-frame OpenEXR image. Assume EXR image contains RGB(A) channels.
+// Loads single-frame OpenEXR image. Assume EXR image contains A(single channel
+// alpha) or RGB(A) channels.
 // Application must free image data as returned by `out_rgba`
 // Result image format is: float x RGBA x width x hight
 // Returns negative value and may set error string in `err` when there's an
@@ -274,9 +270,14 @@ extern int LoadEXR(float **out_rgba, int *width, int *height,
 
 // @deprecated { to be removed. }
 // Saves single-frame OpenEXR image. Assume EXR image contains RGB(A) channels.
-// components must be 3(RGB) or 4(RGBA).
-// Result image format is: float x RGB(A) x width x hight
-extern int SaveEXR(const float *data, int width, int height, int components,
+// components must be 1(Grayscale), 3(RGB) or 4(RGBA).
+// Input image format is: `float x width x height`, or `float x RGB(A) x width x
+// height`
+// Save image as fp16(HALF) format when `save_as_fp16` is positive non-zero
+// value.
+// Save image as fp32(FLOAT) format when `save_as_fp16` is 0.
+extern int SaveEXR(const float *data, const int width, const int height,
+                   const int components, const int save_as_fp16,
                    const char *filename);
 
 // Initialize EXRHeader struct
@@ -406,12 +407,11 @@ extern int LoadDeepEXR(DeepImage *out_image, const char *filename,
 // For emscripten.
 // Loads single-frame OpenEXR image from memory. Assume EXR image contains
 // RGB(A) channels.
-// `out_rgba` must have enough memory(at least sizeof(float) x 4(RGBA) x width x
-// hight)
 // Returns negative value and may set error string in `err` when there's an
 // error
-extern int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory,
-                             size_t size, const char **err);
+extern int LoadEXRFromMemory(float **out_rgba, int *width, int *height,
+							 const unsigned char *memory, size_t size,
+							 const char **err);
 
 #ifdef __cplusplus
 }
@@ -444,7 +444,8 @@ extern int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory,
 
 #if TINYEXR_USE_MINIZ
 #else
-#include "zlib.h"
+//  Issue #46. Please include your own zlib-compatible API header before including `tinyexr.h`
+//#include "zlib.h"
 #endif
 
 #if TINYEXR_USE_ZFP
@@ -483,13 +484,11 @@ namespace miniz {
 #pragma clang diagnostic ignored "-Wsign-conversion"
 #pragma clang diagnostic ignored "-Wc++11-extensions"
 #pragma clang diagnostic ignored "-Wconversion"
-#ifdef __APPLE__
-#if __clang_major__ >= 8 && __clang__minor__ > 1
+#pragma clang diagnostic ignored "-Wunused-function"
+#if __has_warning("-Wcomma")
 #pragma clang diagnostic ignored "-Wcomma"
 #endif
 #endif
-#pragma clang diagnostic ignored "-Wunused-function"
-#endif
 
 /* miniz.c v1.15 - public domain deflate/inflate, zlib-subset, ZIP
    reading/writing/appending, PNG writing
@@ -1918,11 +1917,11 @@ static void def_free_func(void *opaque, void *address) {
   (void)opaque, (void)address;
   MZ_FREE(address);
 }
-static void *def_realloc_func(void *opaque, void *address, size_t items,
-                              size_t size) {
-  (void)opaque, (void)address, (void)items, (void)size;
-  return MZ_REALLOC(address, items * size);
-}
+// static void *def_realloc_func(void *opaque, void *address, size_t items,
+//                              size_t size) {
+//  (void)opaque, (void)address, (void)items, (void)size;
+//  return MZ_REALLOC(address, items * size);
+//}
 
 const char *mz_version(void) { return MZ_VERSION; }
 
@@ -2894,8 +2893,9 @@ void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len,
     tinfl_status status = tinfl_decompress(
         &decomp, (const mz_uint8 *)pSrc_buf + src_buf_ofs, &src_buf_size,
         (mz_uint8 *)pBuf, pBuf ? (mz_uint8 *)pBuf + *pOut_len : NULL,
-        &dst_buf_size, (flags & ~TINFL_FLAG_HAS_MORE_INPUT) |
-                           TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
+        &dst_buf_size,
+        (flags & ~TINFL_FLAG_HAS_MORE_INPUT) |
+            TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF);
     if ((status < 0) || (status == TINFL_STATUS_NEEDS_MORE_INPUT)) {
       MZ_FREE(pBuf);
       *pOut_len = 0;
@@ -3542,9 +3542,10 @@ static int tdefl_flush_block(tdefl_compressor *d, int flush) {
   mz_uint saved_bit_buf, saved_bits_in;
   mz_uint8 *pSaved_output_buf;
   mz_bool comp_block_succeeded = MZ_FALSE;
-  int n, use_raw_block =
-             ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) &&
-             (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size;
+  int n,
+      use_raw_block =
+          ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) &&
+          (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size;
   mz_uint8 *pOutput_buf_start =
       ((d->m_pPut_buf_func == NULL) &&
        ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= TDEFL_OUT_BUF_SIZE))
@@ -3574,8 +3575,9 @@ static int tdefl_flush_block(tdefl_compressor *d, int flush) {
 
   if (!use_raw_block)
     comp_block_succeeded =
-        tdefl_compress_block(d, (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) ||
-                                    (d->m_total_lz_bytes < 48));
+        tdefl_compress_block(d,
+                             (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) ||
+                                 (d->m_total_lz_bytes < 48));
 
   // If the block gets expanded, forget the current contents of the output
   // buffer and send a raw block instead.
@@ -4519,10 +4521,7 @@ void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h,
 #include <stdio.h>
 #include <sys/stat.h>
 
-// -- GODOT change for old MinGW on Travis CI --
-//#if defined(_MSC_VER) || defined(__MINGW64__)
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#if defined(_MSC_VER) || defined(__MINGW64__)
 static FILE *mz_fopen(const char *pFilename, const char *pMode) {
   FILE *pFile = NULL;
   fopen_s(&pFile, pFilename, pMode);
@@ -5223,9 +5222,10 @@ mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index,
   n = MZ_READ_LE16(p + MZ_ZIP_CDH_COMMENT_LEN_OFS);
   n = MZ_MIN(n, MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE - 1);
   pStat->m_comment_size = n;
-  memcpy(pStat->m_comment, p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
-                               MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS) +
-                               MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS),
+  memcpy(pStat->m_comment,
+         p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
+             MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS) +
+             MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS),
          n);
   pStat->m_comment[n] = '\0';
 
@@ -6883,6 +6883,12 @@ void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename,
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+
 }
 #else
 
@@ -7346,11 +7352,23 @@ static void CompressZip(unsigned char *dst,
 
   compressedSize = outSize;
 #endif
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if (compressedSize >= src_size) {
+    compressedSize = src_size;
+    memcpy(dst, src, src_size);
+  }
 }
 
 static void DecompressZip(unsigned char *dst,
                           unsigned long *uncompressed_size /* inout */,
                           const unsigned char *src, unsigned long src_size) {
+  if ((*uncompressed_size) == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+    return;
+  }
   std::vector<unsigned char> tmpBuf(*uncompressed_size);
 
 #if TINYEXR_USE_MINIZ
@@ -7410,6 +7428,22 @@ static void DecompressZip(unsigned char *dst,
 #pragma clang diagnostic ignored "-Wsign-conversion"
 #endif
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4204)  // nonstandard extension used : non-constant
+                                 // aggregate initializer (also supported by GNU
+                                 // C and C99, so no big deal)
+#pragma warning(disable : 4244)  // 'initializing': conversion from '__int64' to
+                                 // 'int', possible loss of data
+#pragma warning( \
+    disable : 4267)  // 'argument': conversion from '__int64' to 'int',
+                     // possible loss of data
+#pragma warning(disable : 4996)  // 'strdup': The POSIX name for this item is
+                                 // deprecated. Instead, use the ISO C and C++
+                                 // conformant name: _strdup.
+#endif
+
+
 const int MIN_RUN_LENGTH = 3;
 const int MAX_RUN_LENGTH = 127;
 
@@ -7502,6 +7536,7 @@ static int rleUncompress(int inLength, int maxLength, const signed char in[],
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
+
 // End of RLE code from OpenEXR -----------------------------------
 
 static void CompressRle(unsigned char *dst,
@@ -7562,11 +7597,24 @@ static void CompressRle(unsigned char *dst,
   assert(outSize > 0);
 
   compressedSize = static_cast<tinyexr::tinyexr_uint64>(outSize);
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if (compressedSize >= src_size) {
+    compressedSize = src_size;
+    memcpy(dst, src, src_size);
+  }
 }
 
 static void DecompressRle(unsigned char *dst,
                           const unsigned long uncompressed_size,
                           const unsigned char *src, unsigned long src_size) {
+  if (uncompressed_size == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+    return;
+  }
+
   std::vector<unsigned char> tmpBuf(uncompressed_size);
 
   int ret = rleUncompress(static_cast<int>(src_size),
@@ -8882,7 +8930,12 @@ static void applyLut(const unsigned short lut[USHORT_RANGE],
 #pragma clang diagnostic pop
 #endif  // __clang__
 
-static bool CompressPiz(unsigned char *outPtr, unsigned int &outSize,
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+
+static bool CompressPiz(unsigned char *outPtr, unsigned int *outSize,
                         const unsigned char *inPtr, size_t inSize,
                         const std::vector<ChannelInfo> &channelInfo,
                         int data_width, int num_lines) {
@@ -8989,16 +9042,29 @@ static bool CompressPiz(unsigned char *outPtr, unsigned int &outSize,
       hufCompress(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()), buf);
   memcpy(lengthPtr, &length, sizeof(int));
 
-  outSize = static_cast<unsigned int>(
+  (*outSize) = static_cast<unsigned int>(
       (reinterpret_cast<unsigned char *>(buf) - outPtr) +
       static_cast<unsigned int>(length));
+
+  // Use uncompressed data when compressed data is larger than uncompressed.
+  // (Issue 40)
+  if ((*outSize) >= inSize) {
+    (*outSize) = static_cast<unsigned int>(inSize);
+    memcpy(outPtr, inPtr, inSize);
+  }
   return true;
 }
 
 static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
-                          size_t tmpBufSize, int num_channels,
+                          size_t tmpBufSize, size_t inLen, int num_channels,
                           const EXRChannelInfo *channels, int data_width,
                           int num_lines) {
+  if (inLen == tmpBufSize) {
+    // Data is not compressed(Issue 40).
+    memcpy(outPtr, inPtr, inLen);
+    return true;
+  }
+
   unsigned char bitmap[BITMAP_SIZE];
   unsigned short minNonZero;
   unsigned short maxNonZero;
@@ -9173,6 +9239,11 @@ static bool DecompressZfp(float *dst, int dst_width, int dst_num_lines,
                           const ZFPCompressionParam &param) {
   size_t uncompressed_size = dst_width * dst_num_lines * num_channels;
 
+  if (uncompressed_size == src_size) {
+    // Data is not compressed(Issue 40).
+    memcpy(dst, src, src_size);
+  }
+
   zfp_stream *zfp = NULL;
   zfp_field *field = NULL;
 
@@ -9317,12 +9388,11 @@ static void DecodePixelData(/* out */ unsigned char **out_images,
     // Allocate original data size.
     std::vector<unsigned char> outBuf(static_cast<size_t>(
         static_cast<size_t>(width * num_lines) * pixel_data_size));
-    size_t tmpBufLen = static_cast<size_t>(
-        static_cast<size_t>(width * num_lines) * pixel_data_size);
+    size_t tmpBufLen = outBuf.size();
 
     bool ret = tinyexr::DecompressPiz(
         reinterpret_cast<unsigned char *>(&outBuf.at(0)), data_ptr, tmpBufLen,
-        static_cast<int>(num_channels), channels, width, num_lines);
+        data_len, static_cast<int>(num_channels), channels, width, num_lines);
 
     assert(ret);
     (void)ret;
@@ -10047,8 +10117,7 @@ static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
 
     } else if (attr_name.compare("compression") == 0) {
       bool ok = false;
-      if ((data[0] >= TINYEXR_COMPRESSIONTYPE_NONE) &&
-          (data[0] < TINYEXR_COMPRESSIONTYPE_PIZ)) {
+      if (data[0] < TINYEXR_COMPRESSIONTYPE_PIZ) {
         ok = true;
       }
 
@@ -10158,9 +10227,14 @@ static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
       // Custom attribute(up to TINYEXR_MAX_ATTRIBUTES)
       if (info->attributes.size() < TINYEXR_MAX_ATTRIBUTES) {
         EXRAttribute attrib;
+#ifdef _MSC_VER
+        strncpy_s(attrib.name, attr_name.c_str(), 255);
+        strncpy_s(attrib.type, attr_type.c_str(), 255);
+#else
         strncpy(attrib.name, attr_name.c_str(), 255);
-        attrib.name[255] = '\0';
         strncpy(attrib.type, attr_type.c_str(), 255);
+#endif
+        attrib.name[255] = '\0';
         attrib.type[255] = '\0';
         attrib.size = static_cast<int>(data.size());
         attrib.value = static_cast<unsigned char *>(malloc(data.size()));
@@ -10254,8 +10328,12 @@ static void ConvertHeader(EXRHeader *exr_header, const HeaderInfo &info) {
   exr_header->channels = static_cast<EXRChannelInfo *>(malloc(
       sizeof(EXRChannelInfo) * static_cast<size_t>(exr_header->num_channels)));
   for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+#ifdef _MSC_VER
+    strncpy_s(exr_header->channels[c].name, info.channels[c].name.c_str(), 255);
+#else
     strncpy(exr_header->channels[c].name, info.channels[c].name.c_str(), 255);
-    // manually add '\0' for safety.
+#endif
+	// manually add '\0' for safety.
     exr_header->channels[c].name[255] = '\0';
 
     exr_header->channels[c].pixel_type = info.channels[c].pixel_type;
@@ -10317,6 +10395,8 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
                                 &channel_offset, num_channels,
                                 exr_header->channels);
 
+  bool invalid_data = false;
+
   if (exr_header->tiled) {
     size_t num_tiles = offsets.size();  // = # of blocks
 
@@ -10411,18 +10491,26 @@ static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
       // Adjust line_no with data_window.bmin.y
       line_no -= exr_header->data_window[1];
 
-      tinyexr::DecodePixelData(
-          exr_image->images, exr_header->requested_pixel_types, data_ptr,
-          static_cast<size_t>(data_len), exr_header->compression_type,
-          exr_header->line_order, data_width, data_height, data_width, y,
-          line_no, num_lines, static_cast<size_t>(pixel_data_size),
-          static_cast<size_t>(exr_header->num_custom_attributes),
-          exr_header->custom_attributes,
-          static_cast<size_t>(exr_header->num_channels), exr_header->channels,
-          channel_offset_list);
+      if (line_no < 0) {
+        invalid_data = true;
+      } else {
+        tinyexr::DecodePixelData(
+            exr_image->images, exr_header->requested_pixel_types, data_ptr,
+            static_cast<size_t>(data_len), exr_header->compression_type,
+            exr_header->line_order, data_width, data_height, data_width, y,
+            line_no, num_lines, static_cast<size_t>(pixel_data_size),
+            static_cast<size_t>(exr_header->num_custom_attributes),
+            exr_header->custom_attributes,
+            static_cast<size_t>(exr_header->num_channels), exr_header->channels,
+            channel_offset_list);
+      }
     }  // omp parallel
   }
 
+  if (invalid_data) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
   // Overwrite `pixel_type` with `requested_pixel_type`.
   {
     for (int c = 0; c < exr_header->num_channels; c++) {
@@ -10641,46 +10729,63 @@ int LoadEXR(float **out_rgba, int *width, int *height, const char *filename,
     }
   }
 
-  if (idxR == -1) {
-    if (err) {
-      (*err) = "R channel not found\n";
+  if ((idxA == 0) && (idxR == -1) && (idxG == -1) && (idxB == -1)) {
+    // Alpha channel only.
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+    for (int i = 0; i < exr_image.width * exr_image.height; i++) {
+      const float val = reinterpret_cast<float **>(exr_image.images)[0][i];
+      (*out_rgba)[4 * i + 0] = val;
+      (*out_rgba)[4 * i + 1] = val;
+      (*out_rgba)[4 * i + 2] = val;
+      (*out_rgba)[4 * i + 3] = val;
     }
+  } else {
+    // Assume RGB(A)
 
-    // @todo { free exr_image }
-    return TINYEXR_ERROR_INVALID_DATA;
-  }
+    if (idxR == -1) {
+      if (err) {
+        (*err) = "R channel not found\n";
+      }
 
-  if (idxG == -1) {
-    if (err) {
-      (*err) = "G channel not found\n";
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
     }
-    // @todo { free exr_image }
-    return TINYEXR_ERROR_INVALID_DATA;
-  }
 
-  if (idxB == -1) {
-    if (err) {
-      (*err) = "B channel not found\n";
+    if (idxG == -1) {
+      if (err) {
+        (*err) = "G channel not found\n";
+      }
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
     }
-    // @todo { free exr_image }
-    return TINYEXR_ERROR_INVALID_DATA;
-  }
 
-  (*out_rgba) = reinterpret_cast<float *>(
-      malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
-             static_cast<size_t>(exr_image.height)));
-  for (int i = 0; i < exr_image.width * exr_image.height; i++) {
-    (*out_rgba)[4 * i + 0] =
-        reinterpret_cast<float **>(exr_image.images)[idxR][i];
-    (*out_rgba)[4 * i + 1] =
-        reinterpret_cast<float **>(exr_image.images)[idxG][i];
-    (*out_rgba)[4 * i + 2] =
-        reinterpret_cast<float **>(exr_image.images)[idxB][i];
-    if (idxA != -1) {
-      (*out_rgba)[4 * i + 3] =
-          reinterpret_cast<float **>(exr_image.images)[idxA][i];
-    } else {
-      (*out_rgba)[4 * i + 3] = 1.0;
+    if (idxB == -1) {
+      if (err) {
+        (*err) = "B channel not found\n";
+      }
+      // @todo { free exr_image }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    (*out_rgba) = reinterpret_cast<float *>(
+        malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+               static_cast<size_t>(exr_image.height)));
+    for (int i = 0; i < exr_image.width * exr_image.height; i++) {
+      (*out_rgba)[4 * i + 0] =
+          reinterpret_cast<float **>(exr_image.images)[idxR][i];
+      (*out_rgba)[4 * i + 1] =
+          reinterpret_cast<float **>(exr_image.images)[idxG][i];
+      (*out_rgba)[4 * i + 2] =
+          reinterpret_cast<float **>(exr_image.images)[idxB][i];
+      if (idxA != -1) {
+        (*out_rgba)[4 * i + 3] =
+            reinterpret_cast<float **>(exr_image.images)[idxA][i];
+      } else {
+        (*out_rgba)[4 * i + 3] = 1.0;
+      }
     }
   }
 
@@ -10720,7 +10825,11 @@ int ParseEXRHeaderFromMemory(EXRHeader *exr_header, const EXRVersion *version,
 
   if (ret != TINYEXR_SUCCESS) {
     if (err && !err_str.empty()) {
+#ifdef _WIN32
+      (*err) = _strdup(err_str.c_str());  // May leak
+#else
       (*err) = strdup(err_str.c_str());  // May leak
+#endif
     }
   }
 
@@ -10732,8 +10841,9 @@ int ParseEXRHeaderFromMemory(EXRHeader *exr_header, const EXRVersion *version,
   return ret;
 }
 
-int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory, size_t size,
-                      const char **err) {
+int LoadEXRFromMemory(float **out_rgba, int *width, int *height,
+	const unsigned char *memory, size_t size,
+	const char **err) {
   if (out_rgba == NULL || memory == NULL) {
     if (err) {
       (*err) = "Invalid argument.\n";
@@ -10756,6 +10866,13 @@ int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory, size_t size,
   if (ret != TINYEXR_SUCCESS) {
     return ret;
   }
+  
+  // Read HALF channel as FLOAT.
+  for (int i = 0; i < exr_header.num_channels; i++) {
+    if (exr_header.pixel_types[i] == TINYEXR_PIXELTYPE_HALF) {
+      exr_header.requested_pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT;
+    }
+  }  
 
   InitEXRImage(&exr_image);
   ret = LoadEXRImageFromMemory(&exr_image, &exr_header, memory, size, err);
@@ -10805,19 +10922,32 @@ int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory, size_t size,
     return TINYEXR_ERROR_INVALID_DATA;
   }
 
-  // Assume `out_rgba` have enough memory allocated.
+  (*out_rgba) = reinterpret_cast<float *>(
+	malloc(4 * sizeof(float) * static_cast<size_t>(exr_image.width) *
+	  static_cast<size_t>(exr_image.height)));
+
   for (int i = 0; i < exr_image.width * exr_image.height; i++) {
-    out_rgba[4 * i + 0] = reinterpret_cast<float **>(exr_image.images)[idxR][i];
-    out_rgba[4 * i + 1] = reinterpret_cast<float **>(exr_image.images)[idxG][i];
-    out_rgba[4 * i + 2] = reinterpret_cast<float **>(exr_image.images)[idxB][i];
-    if (idxA > 0) {
-      out_rgba[4 * i + 3] =
-          reinterpret_cast<float **>(exr_image.images)[idxA][i];
-    } else {
-      out_rgba[4 * i + 3] = 1.0;
-    }
+	(*out_rgba)[4 * i + 0] =
+	 reinterpret_cast<float **>(exr_image.images)[idxR][i];
+	(*out_rgba)[4 * i + 1] =
+	 reinterpret_cast<float **>(exr_image.images)[idxG][i];
+	(*out_rgba)[4 * i + 2] =
+	 reinterpret_cast<float **>(exr_image.images)[idxB][i];
+	if (idxA != -1) {
+	 (*out_rgba)[4 * i + 3] =
+	  reinterpret_cast<float **>(exr_image.images)[idxA][i];
+	}
+	else {
+	 (*out_rgba)[4 * i + 3] = 1.0;
+	}
   }
 
+  if (width) (*width) = exr_image.width;  // the visible NULL check only covers out_rgba/memory
+  if (height) (*height) = exr_image.height;  // so tolerate callers that pass NULL here
+
+  FreeEXRHeader(&exr_header);
+  FreeEXRImage(&exr_image);
+
   return TINYEXR_SUCCESS;
 }
 
@@ -10830,10 +10960,7 @@ int LoadEXRImageFromFile(EXRImage *exr_image, const EXRHeader *exr_header,
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "rb");
 #else
@@ -11315,7 +11442,7 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
       std::vector<unsigned char> block(bufLen);
       unsigned int outSize = static_cast<unsigned int>(block.size());
 
-      CompressPiz(&block.at(0), outSize,
+      CompressPiz(&block.at(0), &outSize,
                   reinterpret_cast<const unsigned char *>(&buf.at(0)),
                   buf.size(), channels, exr_image->width, h);
 
@@ -11422,10 +11549,7 @@ int SaveEXRImageToFile(const EXRImage *exr_image, const EXRHeader *exr_header,
   }
 #endif
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "wb");
 #else
@@ -11459,6 +11583,16 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
+#ifdef _MSC_VER
+  FILE *fp = NULL;
+  errno_t errcode = fopen_s(&fp, filename, "rb");  // fopen_s returns 0 on success
+  if ((0 != errcode) || (!fp)) {  // fail only on a non-zero error code or a NULL stream
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+#else
   FILE *fp = fopen(filename, "rb");
   if (!fp) {
     if (err) {
@@ -11466,6 +11600,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
     }
     return TINYEXR_ERROR_CANT_OPEN_FILE;
   }
+#endif
 
   size_t filesize;
   // Compute size
@@ -11535,6 +11670,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
     if (0 == size) {
       return TINYEXR_ERROR_INVALID_DATA;
     } else if (marker[0] == '\0') {
+      marker++;
       size--;
       break;
     }
@@ -11724,11 +11860,13 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
     // decode sample data.
     {
       unsigned long dstLen = static_cast<unsigned long>(unpackedSampleDataSize);
-      tinyexr::DecompressZip(
-          reinterpret_cast<unsigned char *>(&sample_data.at(0)), &dstLen,
-          data_ptr + 28 + packedOffsetTableSize,
-          static_cast<unsigned long>(packedSampleDataSize));
-      assert(dstLen == static_cast<unsigned long>(unpackedSampleDataSize));
+      if (dstLen) {
+        tinyexr::DecompressZip(
+            reinterpret_cast<unsigned char *>(&sample_data.at(0)), &dstLen,
+            data_ptr + 28 + packedOffsetTableSize,
+            static_cast<unsigned long>(packedSampleDataSize));
+        assert(dstLen == static_cast<unsigned long>(unpackedSampleDataSize));
+      }
     }
 
     // decode sample
@@ -11774,7 +11912,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
         if (channels[c].pixel_type == 0) {  // UINT
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
             unsigned int ui = *reinterpret_cast<unsigned int *>(
-                &sample_data.at(data_offset + x * sizeof(int)));
+                &sample_data.at(size_t(data_offset) + x * sizeof(int)));
             deep_image->image[c][y][x] = static_cast<float>(ui);  // @fixme
           }
           data_offset +=
@@ -11783,7 +11921,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
             tinyexr::FP16 f16;
             f16.u = *reinterpret_cast<unsigned short *>(
-                &sample_data.at(data_offset + x * sizeof(short)));
+                &sample_data.at(size_t(data_offset) + x * sizeof(short)));
             tinyexr::FP32 f32 = half_to_float(f16);
             deep_image->image[c][y][x] = f32.f;
           }
@@ -11791,7 +11929,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
         } else {  // float
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
             float f = *reinterpret_cast<float *>(
-                &sample_data.at(data_offset + x * sizeof(float)));
+                &sample_data.at(size_t(data_offset) + x * sizeof(float)));
             deep_image->image[c][y][x] = f;
           }
           data_offset += sizeof(float) * static_cast<size_t>(samples_per_line);
@@ -11906,10 +12044,7 @@ int ParseEXRHeaderFromFile(EXRHeader *exr_header, const EXRVersion *exr_version,
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "rb");
 #else
@@ -11978,7 +12113,11 @@ int ParseEXRMultipartHeaderFromMemory(EXRHeader ***exr_headers,
 
     if (ret != TINYEXR_SUCCESS) {
       if (err) {
+#ifdef _WIN32
+        (*err) = _strdup(err_str.c_str());  // may leak
+#else
         (*err) = strdup(err_str.c_str());  // may leak
+#endif
       }
       return ret;
     }
@@ -12033,10 +12172,7 @@ int ParseEXRMultipartHeaderFromFile(EXRHeader ***exr_headers, int *num_headers,
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "rb");
 #else
@@ -12136,10 +12272,7 @@ int ParseEXRVersionFromFile(EXRVersion *version, const char *filename) {
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "rb");
 #else
@@ -12277,10 +12410,7 @@ int LoadEXRMultipartImageFromFile(EXRImage *exr_images,
     return TINYEXR_ERROR_INVALID_ARGUMENT;
   }
 
-// -- GODOT change for old MinGW on Travis CI --
-//#ifdef _WIN32
-#if defined(_MSC_VER) || (defined(__MINGW32__) && __MINGW64_VERSION_MAJOR >= 3)
-// -- GODOT end --
+#ifdef _WIN32
   FILE *fp = NULL;
   fopen_s(&fp, filename, "rb");
 #else
@@ -12313,8 +12443,8 @@ int LoadEXRMultipartImageFromFile(EXRImage *exr_images,
 }
 
 int SaveEXR(const float *data, int width, int height, int components,
-            const char *outfilename) {
-  if (components == 3 || components == 4) {
+            const int save_as_fp16, const char *outfilename) {
+  if ((components == 1) || components == 3 || components == 4) {
     // OK
   } else {
     return TINYEXR_ERROR_INVALID_ARGUMENT;
@@ -12333,18 +12463,24 @@ int SaveEXR(const float *data, int width, int height, int components,
   image.num_channels = components;
 
   std::vector<float> images[4];
-  images[0].resize(static_cast<size_t>(width * height));
-  images[1].resize(static_cast<size_t>(width * height));
-  images[2].resize(static_cast<size_t>(width * height));
-  images[3].resize(static_cast<size_t>(width * height));
 
-  // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
-  for (size_t i = 0; i < static_cast<size_t>(width * height); i++) {
-    images[0][i] = data[static_cast<size_t>(components) * i + 0];
-    images[1][i] = data[static_cast<size_t>(components) * i + 1];
-    images[2][i] = data[static_cast<size_t>(components) * i + 2];
-    if (components == 4) {
-      images[3][i] = data[static_cast<size_t>(components) * i + 3];
+  if (components == 1) {
+    images[0].resize(static_cast<size_t>(width) * static_cast<size_t>(height));
+    memcpy(images[0].data(), data, sizeof(float) * size_t(width) * size_t(height));
+  } else {
+    images[0].resize(static_cast<size_t>(width) * static_cast<size_t>(height));
+    images[1].resize(static_cast<size_t>(width) * static_cast<size_t>(height));
+    images[2].resize(static_cast<size_t>(width) * static_cast<size_t>(height));
+    images[3].resize(static_cast<size_t>(width) * static_cast<size_t>(height));
+    // Pixel counts are multiplied as size_t: `width * height` as int can overflow.
+    // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers
+    for (size_t i = 0; i < static_cast<size_t>(width) * static_cast<size_t>(height); i++) {
+      images[0][i] = data[static_cast<size_t>(components) * i + 0];
+      images[1][i] = data[static_cast<size_t>(components) * i + 1];
+      images[2][i] = data[static_cast<size_t>(components) * i + 2];
+      if (components == 4) {
+        images[3][i] = data[static_cast<size_t>(components) * i + 3];
+      }
     }
   }
 
@@ -12354,10 +12490,12 @@ int SaveEXR(const float *data, int width, int height, int components,
     image_ptr[1] = &(images[2].at(0));  // B
     image_ptr[2] = &(images[1].at(0));  // G
     image_ptr[3] = &(images[0].at(0));  // R
-  } else {
+  } else if (components == 3) {
     image_ptr[0] = &(images[2].at(0));  // B
     image_ptr[1] = &(images[1].at(0));  // G
     image_ptr[2] = &(images[0].at(0));  // R
+  } else if (components == 1) {
+    image_ptr[0] = &(images[0].at(0));  // A
   }
 
   image.images = reinterpret_cast<unsigned char **>(image_ptr);
@@ -12369,21 +12507,41 @@ int SaveEXR(const float *data, int width, int height, int components,
       sizeof(EXRChannelInfo) * static_cast<size_t>(header.num_channels)));
   // Must be (A)BGR order, since most of EXR viewers expect this channel order.
   if (components == 4) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+    strncpy_s(header.channels[1].name, "B", 255);
+    strncpy_s(header.channels[2].name, "G", 255);
+    strncpy_s(header.channels[3].name, "R", 255);
+#else
     strncpy(header.channels[0].name, "A", 255);
-    header.channels[0].name[strlen("A")] = '\0';
     strncpy(header.channels[1].name, "B", 255);
-    header.channels[1].name[strlen("B")] = '\0';
     strncpy(header.channels[2].name, "G", 255);
-    header.channels[2].name[strlen("G")] = '\0';
     strncpy(header.channels[3].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
+    header.channels[1].name[strlen("B")] = '\0';
+    header.channels[2].name[strlen("G")] = '\0';
     header.channels[3].name[strlen("R")] = '\0';
-  } else {
+  } else if (components == 3) {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "B", 255);
+    strncpy_s(header.channels[1].name, "G", 255);
+    strncpy_s(header.channels[2].name, "R", 255);
+#else
     strncpy(header.channels[0].name, "B", 255);
-    header.channels[0].name[strlen("B")] = '\0';
     strncpy(header.channels[1].name, "G", 255);
-    header.channels[1].name[strlen("G")] = '\0';
     strncpy(header.channels[2].name, "R", 255);
+#endif
+    header.channels[0].name[strlen("B")] = '\0';
+    header.channels[1].name[strlen("G")] = '\0';
     header.channels[2].name[strlen("R")] = '\0';
+  } else {
+#ifdef _MSC_VER
+    strncpy_s(header.channels[0].name, "A", 255);
+#else
+    strncpy(header.channels[0].name, "A", 255);
+#endif
+    header.channels[0].name[strlen("A")] = '\0';
   }
 
   header.pixel_types = static_cast<int *>(
@@ -12393,9 +12551,15 @@ int SaveEXR(const float *data, int width, int height, int components,
   for (int i = 0; i < header.num_channels; i++) {
     header.pixel_types[i] =
         TINYEXR_PIXELTYPE_FLOAT;  // pixel type of input image
-    header.requested_pixel_types[i] =
-        TINYEXR_PIXELTYPE_HALF;  // pixel type of output image to be stored in
-                                 // .EXR
+
+    if (save_as_fp16 > 0) {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_HALF;  // save with half(fp16) pixel format
+    } else {
+      header.requested_pixel_types[i] =
+          TINYEXR_PIXELTYPE_FLOAT;  // save with float(fp32) pixel format(i.e.
+                                    // no precision reduction)
+    }
   }
 
   const char *err;
@@ -12411,9 +12575,5 @@ int SaveEXR(const float *data, int width, int height, int components,
   return ret;
 }
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 #endif  // TINYEXR_IMPLEMENTATION_DEIFNED
 #endif  // TINYEXR_IMPLEMENTATION